@aws/ml-container-creator 0.9.1 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +2049 -0
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -68
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +138 -138
- package/servers/instance-sizer/lib/instance-ranker.js +76 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/jumpstart-public.json +101 -16
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/catalogs/models.json +182 -26
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +4 -390
- package/src/lib/bootstrap-command-handler.js +710 -1148
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +641 -0
- package/src/lib/bootstrap-provisioners.js +421 -0
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +408 -0
- package/src/lib/config-manager.js +66 -1685
- package/src/lib/config-mcp-client.js +118 -0
- package/src/lib/config-validator.js +634 -0
- package/src/lib/cuda-resolver.js +149 -0
- package/src/lib/e2e-catalog-validator.js +251 -3
- package/src/lib/e2e-ci-recorder.js +103 -0
- package/src/lib/generated/cli-options.js +315 -311
- package/src/lib/generated/parameter-matrix.js +671 -0
- package/src/lib/generated/validation-rules.js +71 -71
- package/src/lib/marketplace-flow.js +276 -0
- package/src/lib/mcp-query-runner.js +768 -0
- package/src/lib/parameter-schema-validator.js +62 -18
- package/src/lib/path-prover-brain.js +607 -0
- package/src/lib/prompt-runner.js +41 -1504
- package/src/lib/prompts/feature-prompts.js +172 -0
- package/src/lib/prompts/index.js +48 -0
- package/src/lib/prompts/infrastructure-prompts.js +690 -0
- package/src/lib/prompts/model-prompts.js +552 -0
- package/src/lib/prompts/project-prompts.js +82 -0
- package/src/lib/prompts.js +2 -1446
- package/src/lib/registry-command-handler.js +135 -3
- package/src/lib/secrets-prompt-runner.js +251 -0
- package/src/lib/template-variable-resolver.js +422 -0
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +149 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/config +108 -5
- package/templates/do/deploy.d/managed-inference.ejs +192 -11
- package/templates/do/optimize +106 -37
- package/templates/do/register +89 -0
- package/templates/do/test +13 -0
- package/templates/do/tune +378 -59
- package/templates/do/validate +44 -4
- package/config/parameter-schema.json +0 -88
package/templates/do/benchmark
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
1
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
3
|
|
|
@@ -19,22 +18,27 @@ CLEAN_AFTER=false
|
|
|
19
18
|
FORCE=false
|
|
20
19
|
IC_ARG=""
|
|
21
20
|
ADAPTER_ARG=""
|
|
21
|
+
ARG_NO_STALE_WARNING=false
|
|
22
|
+
ARG_WORKLOAD=""
|
|
22
23
|
while [ $# -gt 0 ]; do
|
|
23
24
|
case "$1" in
|
|
24
25
|
--clean) CLEAN_AFTER=true; shift ;;
|
|
25
26
|
--force) FORCE=true; shift ;;
|
|
27
|
+
--no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
|
|
28
|
+
--workload) shift; ARG_WORKLOAD="${1:-}"; shift ;;
|
|
26
29
|
--ic) shift; IC_ARG="${1:-}"; shift ;;
|
|
27
30
|
--adapter) shift; ADAPTER_ARG="${1:-}"; shift ;;
|
|
28
31
|
--help|-h)
|
|
29
|
-
echo "Usage: ./do/benchmark [--ic <name>] [--adapter <name>] [--force] [--clean]"
|
|
32
|
+
echo "Usage: ./do/benchmark [--workload <name>] [--ic <name>] [--adapter <name>] [--force] [--clean] [--no-stale-warning]"
|
|
30
33
|
echo ""
|
|
31
34
|
echo "Run SageMaker AI Benchmark against the deployed endpoint."
|
|
32
35
|
echo ""
|
|
33
36
|
echo "Options:"
|
|
34
|
-
echo " --ic <name>
|
|
35
|
-
echo " --adapter <name>
|
|
36
|
-
echo " --force
|
|
37
|
-
echo " --clean
|
|
37
|
+
echo " --ic <name> Benchmark a specific inference component"
|
|
38
|
+
echo " --adapter <name> Benchmark a specific LoRA adapter IC"
|
|
39
|
+
echo " --force Create a new benchmark job even if one is already running"
|
|
40
|
+
echo " --clean Delete workload config and benchmark job after displaying results"
|
|
41
|
+
echo " --no-stale-warning Suppress schema registry staleness warning"
|
|
38
42
|
echo ""
|
|
39
43
|
echo "IC resolution:"
|
|
40
44
|
echo " --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
|
|
@@ -54,6 +58,433 @@ while [ $# -gt 0 ]; do
|
|
|
54
58
|
esac
|
|
55
59
|
done
|
|
56
60
|
|
|
61
|
+
|
|
62
|
+
# ── Require --workload flag ───────────────────────────────────────────────────
|
|
63
|
+
# Print the path of the workload-profiles catalog, or return 1 if none exists.
# Checks the project-local node_modules copy first and then the global npm
# root, which is the same order the workload resolver uses.
_mlcc_help_find_catalog() {
  local rel="@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
  local local_copy npm_root
  local_copy="$(dirname "${BASH_SOURCE[0]}")/../node_modules/${rel}"
  if [ -f "${local_copy}" ]; then
    printf '%s\n' "${local_copy}"
    return 0
  fi
  if command -v npm &>/dev/null; then
    npm_root=$(npm root -g 2>/dev/null) || npm_root=""
    if [ -n "${npm_root}" ] && [ -f "${npm_root}/${rel}" ]; then
      printf '%s\n' "${npm_root}/${rel}"
      return 0
    fi
  fi
  return 1
}

# Print one line per workload in catalog file $1 (name plus the first 50
# characters of its description). Returns non-zero if the file is unreadable.
# The Python program is single-quoted and receives the path through argv, so
# neither the path nor the inner string quotes are reinterpreted by the shell.
_mlcc_help_list_workloads() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    catalog = json.load(f)
for name, wl in catalog.get("workloads", {}).items():
    desc = wl.get("description", "")[:50]
    print(f" {name:30s} {desc}")
' "$1" 2>/dev/null
}

# Abort with usage help when --workload was not supplied.
if [ -z "${ARG_WORKLOAD:-}" ]; then
  echo "❌ --workload <name> is required"
  echo ""
  # List available workloads from the MCP catalog
  _CATALOG_FOR_HELP=$(_mlcc_help_find_catalog) || _CATALOG_FOR_HELP=""
  if [ -n "${_CATALOG_FOR_HELP}" ]; then
    echo " Available workloads:"
    _mlcc_help_list_workloads "${_CATALOG_FOR_HELP}" \
      || echo " (could not read workload catalog)"
  else
    echo " Run 'ml-container-creator mcp init' to install workload profiles"
  fi
  echo ""
  echo " Usage: ./do/benchmark --workload multi_turn_chat"
  exit 1
fi
|
|
90
|
+
|
|
91
|
+
# ── Workload Resolution (from workload-picker MCP server catalog) ─────────────
|
|
92
|
+
# If --workload is passed with a named workload (not "manual"), resolve
|
|
93
|
+
# the workload parameters from the MCP server's catalog file. This overrides
|
|
94
|
+
# BENCHMARK_INPUT_TOKENS_MEAN, BENCHMARK_OUTPUT_TOKENS_MEAN, BENCHMARK_STREAMING,
|
|
95
|
+
# and BENCHMARK_CONCURRENCY_LEVELS from do/config.
|
|
96
|
+
# Print four lines for workload $2 from catalog file $1:
#   input_tokens_mean, output_tokens_mean, streaming (lower-cased),
#   concurrency_levels joined with commas.
# A key missing from the profile yields an empty line. Returns non-zero when
# the catalog cannot be read or the workload is not in it.
# Both arguments reach Python through argv, so a workload name containing
# quotes cannot alter the Python program.
_mlcc_workload_fields() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    catalog = json.load(f)
wl = catalog.get("workloads", {}).get(sys.argv[2])
if not wl:
    sys.exit(1)

def field(key):
    value = wl.get(key)
    return "" if value is None else str(value)

print(field("input_tokens_mean"))
print(field("output_tokens_mean"))
print(field("streaming").lower())
print(",".join(str(x) for x in (wl.get("concurrency_levels") or [])))
' "$1" "$2" 2>/dev/null
}

# Apply workload $2 from catalog $1 to the BENCHMARK_* globals and print a
# short summary. Returns 1 (changing nothing) when the workload is unknown.
_mlcc_apply_workload_profile() {
  local fields input output streaming levels
  fields=$(_mlcc_workload_fields "$1" "$2") || return 1
  # Command substitution drops trailing blank lines, so later reads may hit
  # EOF; they still assign an empty value, hence the "|| true".
  {
    IFS= read -r input
    IFS= read -r output
    IFS= read -r streaming
    IFS= read -r levels
  } <<< "${fields}" || true

  echo "📋 Workload profile: $2"
  BENCHMARK_INPUT_TOKENS_MEAN="${input}"
  BENCHMARK_OUTPUT_TOKENS_MEAN="${output}"
  BENCHMARK_STREAMING="${streaming}"
  # Set concurrency levels for multi-level mode if not already overridden
  if [ -z "${BENCHMARK_CONCURRENCY_LEVELS:-}" ]; then
    BENCHMARK_CONCURRENCY_LEVELS="${levels}"
  fi
  # Override single-level BENCHMARK_CONCURRENCY with the first workload level,
  # except in a single-level child run: there the multi-level parent has
  # exported the exact level to test and it must not be clobbered.
  if [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
    BENCHMARK_CONCURRENCY="${levels%%,*}"
  fi
  echo " Input tokens: ${BENCHMARK_INPUT_TOKENS_MEAN}, Output tokens: ${BENCHMARK_OUTPUT_TOKENS_MEAN}"
  echo " Streaming: ${BENCHMARK_STREAMING}, Concurrency: ${BENCHMARK_CONCURRENCY_LEVELS:-${BENCHMARK_CONCURRENCY}}"
  echo ""
}

BENCHMARK_WORKLOAD="${ARG_WORKLOAD:-manual}"

if [ "${BENCHMARK_WORKLOAD}" != "manual" ]; then
  # Locate the workload catalog (project-local node_modules, then npm global)
  _WORKLOAD_CATALOG_REL="@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
  _WORKLOAD_CATALOG=""
  if [ -f "$(dirname "${BASH_SOURCE[0]}")/../node_modules/${_WORKLOAD_CATALOG_REL}" ]; then
    _WORKLOAD_CATALOG="$(dirname "${BASH_SOURCE[0]}")/../node_modules/${_WORKLOAD_CATALOG_REL}"
  elif command -v npm &>/dev/null; then
    _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
    if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/${_WORKLOAD_CATALOG_REL}" ]; then
      _WORKLOAD_CATALOG="${_NPM_ROOT}/${_WORKLOAD_CATALOG_REL}"
    fi
  fi

  if [ -z "${_WORKLOAD_CATALOG}" ]; then
    echo "⚠️ Workload catalog not found — using do/config defaults"
  elif ! _mlcc_apply_workload_profile "${_WORKLOAD_CATALOG}" "${BENCHMARK_WORKLOAD}"; then
    echo "⚠️ Unknown workload '${BENCHMARK_WORKLOAD}' — using do/config defaults"
  fi
fi
|
|
143
|
+
|
|
144
|
+
# ── Resolve profile-level values ──────────────────────────────────────────────
|
|
145
|
+
# Read S3 buckets and account info from the bootstrap profile
|
|
146
|
+
# Print two lines derived from a bootstrap profile:
#   1. the benchmark S3 output path  s3://<bucket>/<project>/
#   2. the profile's ciBenchmarkResultsBucket (empty when absent)
# $1 is the profile as a JSON object, $2 the project name. The bucket is the
# profile's benchmarkS3Bucket or, when that is unset,
# mlcc-benchmark-<accountId>-<awsRegion> (defaults: "unknown", "us-east-1").
# Both values reach Python through argv, so a project name containing quotes
# or braces cannot break the program. Returns non-zero if $1 is not a JSON
# object or python3 is unavailable.
_mlcc_profile_benchmark_values() {
  python3 -c '
import json, sys
profile = json.loads(sys.argv[1])
bucket = profile.get("benchmarkS3Bucket", "")
if not bucket:
    acct = profile.get("accountId", "unknown")
    region = profile.get("awsRegion", "us-east-1")
    bucket = f"mlcc-benchmark-{acct}-{region}"
print(f"s3://{bucket}/{sys.argv[2]}/")
print(profile.get("ciBenchmarkResultsBucket", ""))
' "${1:-}" "${2:-}" 2>/dev/null
}

# Read the active bootstrap profile from ~/.ml-container-creator/config.json.
# Any read or lookup failure yields an empty profile ("{}").
_PROFILE_JSON=""
if command -v python3 &>/dev/null; then
  _PROFILE_JSON=$(python3 -c '
import json, os
try:
    with open(os.path.expanduser("~/.ml-container-creator/config.json")) as f:
        config = json.load(f)
    print(json.dumps(config["profiles"][config["activeProfile"]]))
except Exception:
    print("{}")
' 2>/dev/null) || _PROFILE_JSON="{}"
fi

# Extract benchmark-relevant profile values (both stay empty on failure)
BENCHMARK_S3_OUTPUT_PATH=""
CI_BENCHMARK_RESULTS_BUCKET=""
if _MLCC_PROFILE_VALUES=$(_mlcc_profile_benchmark_values "${_PROFILE_JSON}" "${PROJECT_NAME:-}"); then
  # The second read hits EOF when the CI bucket is empty; it still assigns "".
  {
    IFS= read -r BENCHMARK_S3_OUTPUT_PATH
    IFS= read -r CI_BENCHMARK_RESULTS_BUCKET
  } <<< "${_MLCC_PROFILE_VALUES}" || true
fi

# Derive job names at runtime (unique per invocation). One timestamp is shared
# so the job name and workload-config name always carry the same suffix.
_MLCC_RUN_STAMP=$(date +%Y%m%d-%H%M%S)
BENCHMARK_JOB_NAME="${PROJECT_NAME:-}-benchmark-${_MLCC_RUN_STAMP}"
BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME:-}-benchmark-config-${_MLCC_RUN_STAMP}"

# Ensure benchmark params have defaults (in case workload catalog wasn't found)
BENCHMARK_CONCURRENCY=${BENCHMARK_CONCURRENCY:-10}
BENCHMARK_INPUT_TOKENS_MEAN=${BENCHMARK_INPUT_TOKENS_MEAN:-550}
BENCHMARK_OUTPUT_TOKENS_MEAN=${BENCHMARK_OUTPUT_TOKENS_MEAN:-150}
BENCHMARK_STREAMING=${BENCHMARK_STREAMING:-true}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ── Multi-level concurrency support (CI Stage 2) ─────────────────────────────
|
|
188
|
+
# When BENCHMARK_CONCURRENCY_LEVELS is set (comma-separated integers, e.g. "1,4,8"
|
|
189
|
+
# or JSON array string, e.g. "[1,4,8]"), and we are NOT already in single-level
|
|
190
|
+
# execution mode (_BENCHMARK_SINGLE_LEVEL), the script iterates over each level,
|
|
191
|
+
# re-invoking itself for each one.
|
|
192
|
+
# Results from all levels are aggregated into a combined JSON for the benchmark writer.
|
|
193
|
+
# This supports Requirement 1.5: configurable concurrency levels per config.
|
|
194
|
+
# Print a 16-hex-character identifier for the current deployment
# configuration: the first 16 characters of the SHA-256 of
# DEPLOYMENT_CONFIG:MODEL_NAME:INSTANCE_TYPE:AWS_REGION:DEPLOYMENT_TARGET:ic<N>:adapt<M>.
# Uses sha256sum, then shasum, then python3 hashlib, whichever is available.
_compute_config_id() {
  local input="${DEPLOYMENT_CONFIG:-}:${MODEL_NAME:-none}:${INSTANCE_TYPE:-}:${AWS_REGION:-}:${DEPLOYMENT_TARGET:-}:ic${IC_COUNT:-1}:adapt${ADAPTER_COUNT:-0}"
  local digest
  # printf instead of "echo -n": the latter is not portable across shells.
  if command -v sha256sum &>/dev/null; then
    digest=$(printf '%s' "${input}" | sha256sum)
  elif command -v shasum &>/dev/null; then
    digest=$(printf '%s' "${input}" | shasum -a 256)
  else
    digest=$(printf '%s' "${input}" \
      | python3 -c 'import hashlib, sys; print(hashlib.sha256(sys.stdin.buffer.read()).hexdigest())') || return 1
  fi
  printf '%s\n' "${digest:0:16}"
}

# Aggregate per-level result files into one combined JSON document.
#   $1 directory holding profile-concurrency-<N>.jsonl (preferred) or
#      results-concurrency-<N>.json (used only when no JSONL entry was built)
#   $2 path of the combined JSON file to write
#   $3 configured input-token mean   (recorded in every entry, default 0)
#   $4 configured output-token mean  (recorded in every entry, default 0)
# Entries are emitted in ascending numeric concurrency order. All values are
# passed through argv, so paths with quotes and non-numeric means cannot
# corrupt the Python program.
_mlcc_aggregate_levels() {
  python3 -c '
import glob, json, math, os, sys

results_dir, combined_file = sys.argv[1], sys.argv[2]


def to_number(text):
    # Token means arrive as strings; anything non-numeric becomes 0.
    for cast in (int, float):
        try:
            value = cast(text)
        except ValueError:
            continue
        if isinstance(value, float) and not math.isfinite(value):
            return 0
        return value
    return 0


input_tokens_mean = to_number(sys.argv[3])
output_tokens_mean = to_number(sys.argv[4])


def percentile(sorted_vals, pct):
    # Linear interpolation between the two nearest ranks.
    if not sorted_vals:
        return 0.0
    idx = (pct / 100.0) * (len(sorted_vals) - 1)
    lower = int(math.floor(idx))
    upper = int(math.ceil(idx))
    if lower == upper:
        return sorted_vals[lower]
    frac = idx - lower
    return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac


def mean(vals):
    return sum(vals) / len(vals) if vals else 0.0


def get_val(metrics, key):
    # A metric is either a bare value or a dict carrying it under "value".
    m = metrics.get(key)
    if isinstance(m, dict):
        return m.get("value")
    return m


def collect(bucket, value):
    if value is not None:
        bucket.append(value)


def leveled(pattern, prefix, suffix):
    # (level, path) pairs for files matching pattern, in numeric level order.
    pairs = []
    for path in glob.glob(os.path.join(results_dir, pattern)):
        name = os.path.basename(path)
        try:
            pairs.append((int(name.replace(prefix, "").replace(suffix, "")), path))
        except ValueError as exc:
            print(f"Warning: Could not parse {path}: {exc}", file=sys.stderr)
    return sorted(pairs)


def aggregate_jsonl(level, path):
    records = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    if not records:
        return None

    latencies, ttfts, itls, ttsts = [], [], [], []
    out_tokens, in_tokens = [], []
    prefill_tps, output_tps = [], []
    start_times, end_times = [], []

    for rec in records:
        meta = rec.get("metadata", {})
        metrics = rec.get("metrics", {})
        collect(latencies, get_val(metrics, "request_latency"))
        collect(ttfts, get_val(metrics, "time_to_first_token")
                or get_val(metrics, "time_to_first_output_token"))
        collect(itls, get_val(metrics, "inter_token_latency"))
        collect(ttsts, get_val(metrics, "time_to_second_token"))
        collect(out_tokens, get_val(metrics, "output_token_count"))
        collect(in_tokens, get_val(metrics, "input_sequence_length"))
        collect(prefill_tps, get_val(metrics, "prefill_throughput_per_user"))
        collect(output_tps, get_val(metrics, "output_token_throughput_per_user"))
        rs = meta.get("request_start_ns")
        re_ = meta.get("request_end_ns")
        if rs:
            start_times.append(rs)
        if re_:
            end_times.append(re_)

    for series in (latencies, ttfts, itls, ttsts, prefill_tps, output_tps):
        series.sort()

    # Wall-clock span of the run in seconds; 1.0 when timestamps are absent,
    # floored at 1 ms so the throughput divisions below are always defined.
    if start_times and end_times:
        duration_s = (max(end_times) - min(start_times)) / 1e9
    else:
        duration_s = 1.0
    duration_s = max(duration_s, 0.001)

    return {
        "concurrency": level,
        "request_throughput": len(records) / duration_s,
        "output_token_throughput": sum(out_tokens) / duration_s if out_tokens else 0.0,
        "total_requests": len(records),
        "duration_seconds": duration_s,
        "time_to_first_token": {
            "avg": mean(ttfts),
            "p50": percentile(ttfts, 50),
            "p90": percentile(ttfts, 90),
            "p99": percentile(ttfts, 99),
        },
        "inter_token_latency": {
            "avg": mean(itls),
            "p50": percentile(itls, 50),
            "p90": percentile(itls, 90),
            "p99": percentile(itls, 99),
        },
        "e2e_latency": {
            "avg": mean(latencies),
            "p50": percentile(latencies, 50),
            "p90": percentile(latencies, 90),
            "p99": percentile(latencies, 99),
        },
        "time_to_second_token": {
            "p50": percentile(ttsts, 50),
            "p90": percentile(ttsts, 90),
        },
        "prefill_throughput": {
            "avg": mean(prefill_tps),
            "p50": percentile(prefill_tps, 50),
        },
        "output_token_throughput_detail": {
            "avg": mean(output_tps),
            "p50": percentile(output_tps, 50),
            "p90": percentile(output_tps, 90),
        },
        "total_token_throughput": (sum(out_tokens) + sum(in_tokens)) / duration_s
        if (out_tokens or in_tokens) else 0.0,
        "output_sequence_length": mean(out_tokens),
        "input_sequence_length": mean(in_tokens),
        "request_count": len(records),
        "input_tokens_mean": input_tokens_mean,
        "output_tokens_mean": output_tokens_mean,
    }


combined = {"metrics": []}

# Process JSONL files (preferred)
for level, path in leveled("profile-concurrency-*.jsonl", "profile-concurrency-", ".jsonl"):
    try:
        entry = aggregate_jsonl(level, path)
        if entry is not None:
            combined["metrics"].append(entry)
    except Exception as exc:
        print(f"Warning: Could not parse {path}: {exc}", file=sys.stderr)

# Fallback: process old-style JSON files if no JSONL entry was produced
if not combined["metrics"]:
    for level, path in leveled("results-concurrency-*.json", "results-concurrency-", ".json"):
        try:
            with open(path) as fp:
                data = json.load(fp)
            if isinstance(data, dict):
                data["concurrency"] = level
                combined["metrics"].append(data)
        except Exception as exc:
            print(f"Warning: Could not parse {path}: {exc}", file=sys.stderr)

# Serialize before opening the output so a failure cannot leave a
# half-written document behind.
try:
    payload = json.dumps(combined, indent=2)
except TypeError as exc:
    print(f"Warning: JSON serialize error: {exc}", file=sys.stderr)
    combined = {"metrics": []}
    payload = json.dumps(combined, indent=2)
with open(combined_file, "w") as fp:
    fp.write(payload)
n_metrics = len(combined["metrics"])
print(f"Combined {n_metrics} concurrency level results")
' "$1" "$2" "${3:-0}" "${4:-0}"
}

# Multi-level driver: when BENCHMARK_CONCURRENCY_LEVELS is set and this is not
# already a single-level child run, re-invoke this script once per level,
# collect each child's result file, aggregate them, optionally persist the
# aggregate, and exit. Exits 1 when no level completed successfully.
if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
  # Normalize: strip brackets and spaces, leaving a comma-separated list
  _NORMALIZED_LEVELS=$(printf '%s' "${BENCHMARK_CONCURRENCY_LEVELS}" | tr -d '[] ')

  # Skip if empty after normalization
  if [ -n "${_NORMALIZED_LEVELS}" ]; then
    echo "📊 Multi-level benchmark: running concurrency levels [${_NORMALIZED_LEVELS}]"
    echo ""

    IFS=',' read -ra _LEVELS <<< "${_NORMALIZED_LEVELS}"
    _ALL_RESULTS_DIR="${SCRIPT_DIR}/../benchmarks/multi-level-$(date +%Y%m%d-%H%M%S)"
    mkdir -p "${_ALL_RESULTS_DIR}"
    _LEVEL_FAILURES=0
    _LEVEL_SUCCESSES=0
    # Marker file the child run writes with the path of its results directory.
    _MARKER_FILE="/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}"

    for _LEVEL in "${_LEVELS[@]}"; do
      _LEVEL="${_LEVEL// /}"
      # Skip non-numeric values
      if ! [[ "${_LEVEL}" =~ ^[0-9]+$ ]]; then
        echo "⚠️ Skipping invalid concurrency level: ${_LEVEL}"
        continue
      fi
      echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
      echo " Running benchmark at concurrency level: ${_LEVEL}"
      echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
      echo ""

      # Re-invoke self with overridden concurrency and single-level flag
      export BENCHMARK_CONCURRENCY="${_LEVEL}"
      export _BENCHMARK_SINGLE_LEVEL=1
      # Build the argument list as an array so values containing whitespace
      # or glob characters reach the child as single arguments.
      _REINVOKE_ARGS=(--force)
      if [ "${CLEAN_AFTER}" = true ]; then _REINVOKE_ARGS+=(--clean); fi
      if [ "${ARG_NO_STALE_WARNING}" = true ]; then _REINVOKE_ARGS+=(--no-stale-warning); fi
      if [ -n "${ARG_WORKLOAD}" ]; then _REINVOKE_ARGS+=(--workload "${ARG_WORKLOAD}"); fi
      if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS+=(--ic "${IC_ARG}"); fi
      if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS+=(--adapter "${ADAPTER_ARG}"); fi

      # Drop any marker left by an earlier run so a stale path is never
      # mistaken for this level's results.
      rm -f -- "${_MARKER_FILE}"

      if "${BASH_SOURCE[0]}" "${_REINVOKE_ARGS[@]}"; then
        _LEVEL_SUCCESSES=$((_LEVEL_SUCCESSES + 1))
        # Find the child's results: marker file first, then the most recently
        # modified job directory.
        _LATEST_JOB_DIR=""
        if [ -f "${_MARKER_FILE}" ]; then
          _LATEST_JOB_DIR=$(cat "${_MARKER_FILE}" 2>/dev/null)
        fi
        if [ -z "${_LATEST_JOB_DIR}" ] || [ ! -d "${_LATEST_JOB_DIR}" ]; then
          _LATEST_JOB_DIR=""
          for _CANDIDATE in "${SCRIPT_DIR}/../benchmarks/${PROJECT_NAME}-benchmark-"*; do
            [ -d "${_CANDIDATE}" ] || continue
            if [ -z "${_LATEST_JOB_DIR}" ] || [ "${_CANDIDATE}" -nt "${_LATEST_JOB_DIR}" ]; then
              _LATEST_JOB_DIR="${_CANDIDATE}"
            fi
          done
        fi
        # Copy results to the aggregation directory (JSONL preferred)
        if [ -n "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export.jsonl" ]; then
          cp "${_LATEST_JOB_DIR}/output/profile_export.jsonl" "${_ALL_RESULTS_DIR}/profile-concurrency-${_LEVEL}.jsonl"
        elif [ -n "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" ]; then
          cp "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" "${_ALL_RESULTS_DIR}/results-concurrency-${_LEVEL}.json"
        fi
      else
        echo "⚠️ Benchmark at concurrency ${_LEVEL} failed (non-fatal, continuing)"
        _LEVEL_FAILURES=$((_LEVEL_FAILURES + 1))
      fi
      unset _BENCHMARK_SINGLE_LEVEL
      echo ""
    done

    # Aggregate results into a combined JSON file for the benchmark writer
    echo "📊 Aggregating multi-level results..."
    _COMBINED_FILE="${_ALL_RESULTS_DIR}/results.json"
    _mlcc_aggregate_levels "${_ALL_RESULTS_DIR}" "${_COMBINED_FILE}" \
      "${BENCHMARK_INPUT_TOKENS_MEAN:-0}" "${BENCHMARK_OUTPUT_TOKENS_MEAN:-0}" 2>&1

    # Persist to Athena if CI mode is active
    if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ] && [ -f "${_COMBINED_FILE}" ]; then
      echo ""
      echo "📊 Persisting multi-level benchmark results to Athena..."

      CONFIG_ID=$(_compute_config_id)

      if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
        --results-file "${_COMBINED_FILE}" \
        --config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
        --project-name "${PROJECT_NAME}" \
        --workload "${BENCHMARK_WORKLOAD:-manual}" \
        --bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
        --region "${AWS_REGION:-${REGION:-}}"; then
        echo "✅ Multi-level benchmark results persisted to S3"
      else
        echo "⚠️ Failed to persist multi-level benchmark results to Athena (non-fatal)"
      fi
    fi

    echo ""
    echo "📋 Multi-level Summary:"
    echo " Levels tested: ${_NORMALIZED_LEVELS}"
    echo " Failures: ${_LEVEL_FAILURES} / ${#_LEVELS[@]}"
    echo " Results: ${_ALL_RESULTS_DIR}/"

    # Fail when nothing succeeded. Counting successes (rather than comparing
    # failures with the list length) also covers lists in which some or all
    # entries were skipped as invalid.
    if [ "${_LEVEL_SUCCESSES}" -eq 0 ]; then
      echo "❌ All concurrency levels failed"
      exit 1
    fi
    exit 0
  fi
fi
|
|
450
|
+
|
|
451
|
+
# ── _check_schema_registry_staleness() ────────────────────────────────────────
|
|
452
|
+
# Warn if the schema registry manifest's lastSynced timestamp is older than threshold.
|
|
453
|
+
# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
|
|
454
|
+
# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
|
|
455
|
+
# Warn on stdout when the schema registry manifest is older than a threshold.
#
# Globals read:
#   MCC_NO_STALE_WARNING        "true" suppresses the check
#   ARG_NO_STALE_WARNING        true suppresses the check (--no-stale-warning)
#   MCC_CATALOG_STALENESS_DAYS  threshold in days (default 90; a value that is
#                               not a plain non-negative integer falls back
#                               to 90 instead of silently disabling the check)
#   HOME                        manifest is read from
#                               $HOME/.ml-container-creator/schemas/manifest.json
# Outputs: one warning line when lastSynced is more than the threshold old.
# Returns: 0 always. The check is best-effort: a missing manifest, missing
#          python3, or an unparsable timestamp produces no warning.
_check_schema_registry_staleness() {
  if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
    return 0
  fi

  local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
  case "${threshold}" in
    '' | *[!0-9]*) threshold=90 ;;
  esac

  local manifest_file="${HOME}/.ml-container-creator/schemas/manifest.json"
  if [ ! -f "${manifest_file}" ]; then
    return 0
  fi

  # The manifest path and threshold are passed through argv instead of being
  # spliced into the Python source, so quotes in $HOME cannot break the program.
  local days_old
  days_old=$(python3 -c '
import json, sys
from datetime import datetime, timezone
try:
    with open(sys.argv[1]) as f:
        manifest = json.load(f)
    stamp = manifest.get("lastSynced", "")
    if stamp:
        synced = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        # A timestamp without an offset is taken to be UTC; subtracting a
        # naive datetime from an aware one would raise TypeError.
        if synced.tzinfo is None:
            synced = synced.replace(tzinfo=timezone.utc)
        days = (datetime.now(timezone.utc) - synced).days
        if days > int(sys.argv[2]):
            print(days)
except Exception:
    pass
' "${manifest_file}" "${threshold}" 2>/dev/null) || days_old=""

  if [ -n "${days_old}" ]; then
    echo "⚠️ Schema registry is ${days_old} days old. Run 'ml-container-creator bootstrap sync-schemas' to update."
  fi
  return 0
}
|
|
485
|
+
|
|
486
|
+
_check_schema_registry_staleness
|
|
487
|
+
|
|
57
488
|
# ── Verify AWS CLI v2 ─────────────────────────────────────────────────────────
|
|
58
489
|
if ! aws --version 2>&1 | grep -q "aws-cli/2"; then
|
|
59
490
|
echo "❌ AWS CLI v2 is required for benchmarking."
|
|
@@ -185,7 +616,7 @@ if [ "${FORCE}" = false ] && [ -n "${BENCHMARK_JOB_NAME:-}" ]; then
|
|
|
185
616
|
fi
|
|
186
617
|
|
|
187
618
|
# ── Configuration ─────────────────────────────────────────────────────────────
|
|
188
|
-
WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config"
|
|
619
|
+
WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
|
|
189
620
|
if [ "${RESUME_EXISTING}" = false ]; then
|
|
190
621
|
BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)"
|
|
191
622
|
fi
|
|
@@ -357,6 +788,7 @@ if [ -n "${EXISTING_CONFIG_SPEC}" ]; then
|
|
|
357
788
|
|
|
358
789
|
if [ "${EXISTING_NORMALIZED}" = "${DESIRED_NORMALIZED}" ]; then
|
|
359
790
|
echo " ✅ Existing workload config matches current parameters — reusing"
|
|
791
|
+
CREATE_WORKLOAD_CONFIG=false
|
|
360
792
|
else
|
|
361
793
|
echo " ⚠️ Workload config parameters changed — recreating..."
|
|
362
794
|
aws sagemaker delete-ai-workload-config \
|
|
@@ -484,10 +916,11 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
484
916
|
# Persist results locally to benchmarks/<job-name>/
|
|
485
917
|
PROJECT_ROOT="${SCRIPT_DIR}/.."
|
|
486
918
|
LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${BENCHMARK_JOB_NAME}"
|
|
487
|
-
|
|
919
|
+
RESULTS_JSONL="${LOCAL_RESULTS_DIR}/output/profile_export.jsonl"
|
|
920
|
+
RESULTS_FILE="${LOCAL_RESULTS_DIR}/output/profile_export_aiperf.json"
|
|
488
921
|
|
|
489
922
|
# Check if results already exist locally (idempotency: skip S3 download)
|
|
490
|
-
if [ -f "${RESULTS_FILE}" ]; then
|
|
923
|
+
if [ -f "${RESULTS_JSONL}" ] || [ -f "${RESULTS_FILE}" ]; then
|
|
491
924
|
echo "📥 Step 4: Results already available locally"
|
|
492
925
|
RESULTS_DOWNLOADED=true
|
|
493
926
|
else
|
|
@@ -513,12 +946,27 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
513
946
|
# This is the most reliable approach — handles any subdirectory structure
|
|
514
947
|
echo " Syncing results from S3..."
|
|
515
948
|
if aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/" --region "${AWS_REGION}" 2>/dev/null; then
|
|
516
|
-
#
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
949
|
+
# Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
|
|
950
|
+
# Extract every *.tar.gz found under directory $1 into the directory that
# contains it. Archive paths are read NUL-delimited so directories containing
# spaces or glob characters are handled; extraction failures are ignored
# (best-effort, the caller looks for the result files afterwards).
_mlcc_extract_archives() {
  local archive
  while IFS= read -r -d '' archive; do
    tar -xzf "${archive}" -C "$(dirname "${archive}")" 2>/dev/null || true
  done < <(find "$1" -name "*.tar.gz" -type f -print0 2>/dev/null)
}
_mlcc_extract_archives "${LOCAL_RESULTS_DIR}"
|
|
954
|
+
|
|
955
|
+
# Look for specific result files (priority: JSONL > aiperf JSON)
|
|
956
|
+
_FOUND_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
|
|
957
|
+
_FOUND_JSON=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1)
|
|
958
|
+
|
|
959
|
+
if [ -n "${_FOUND_JSONL}" ]; then
|
|
960
|
+
if [ "${_FOUND_JSONL}" != "${RESULTS_JSONL}" ]; then
|
|
961
|
+
mkdir -p "$(dirname "${RESULTS_JSONL}")"
|
|
962
|
+
cp "${_FOUND_JSONL}" "${RESULTS_JSONL}"
|
|
963
|
+
fi
|
|
964
|
+
RESULTS_DOWNLOADED=true
|
|
965
|
+
fi
|
|
966
|
+
if [ -n "${_FOUND_JSON}" ]; then
|
|
967
|
+
if [ "${_FOUND_JSON}" != "${RESULTS_FILE}" ]; then
|
|
968
|
+
mkdir -p "$(dirname "${RESULTS_FILE}")"
|
|
969
|
+
cp "${_FOUND_JSON}" "${RESULTS_FILE}"
|
|
522
970
|
fi
|
|
523
971
|
RESULTS_DOWNLOADED=true
|
|
524
972
|
fi
|
|
@@ -531,33 +979,25 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
531
979
|
RESULTS_BUCKET=$(echo "${RESULTS_S3_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
|
|
532
980
|
RESULTS_PREFIX=$(echo "${RESULTS_S3_PATH}" | sed "s|s3://${RESULTS_BUCKET}/||")
|
|
533
981
|
|
|
534
|
-
# List all objects
|
|
535
|
-
|
|
536
|
-
FOUND_KEY=$(aws s3api list-objects-v2 \
|
|
982
|
+
# List all objects and look for our target files
|
|
983
|
+
_ALL_KEYS=$(aws s3api list-objects-v2 \
|
|
537
984
|
--bucket "${RESULTS_BUCKET}" \
|
|
538
985
|
--prefix "${RESULTS_PREFIX}" \
|
|
539
986
|
--region "${AWS_REGION}" \
|
|
540
987
|
--query 'Contents[].Key' \
|
|
541
|
-
--output text 2>/dev/null \
|
|
542
|
-
| tr '\t' '\n' \
|
|
543
|
-
| grep -E '\.(json|jsonl|csv)$' \
|
|
544
|
-
| head -1)
|
|
545
|
-
|
|
546
|
-
if [ -n "${FOUND_KEY}" ] && [ "${FOUND_KEY}" != "None" ]; then
|
|
547
|
-
if aws s3 cp "s3://${RESULTS_BUCKET}/${FOUND_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
|
|
548
|
-
RESULTS_DOWNLOADED=true
|
|
549
|
-
fi
|
|
550
|
-
fi
|
|
551
|
-
fi
|
|
988
|
+
--output text 2>/dev/null | tr '\t' '\n')
|
|
552
989
|
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
990
|
+
_JSONL_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export\.jsonl$" | head -1)
|
|
991
|
+
_JSON_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export_aiperf\.json$" | head -1)
|
|
992
|
+
|
|
993
|
+
if [ -n "${_JSONL_KEY}" ] && [ "${_JSONL_KEY}" != "None" ]; then
|
|
994
|
+
mkdir -p "$(dirname "${RESULTS_JSONL}")"
|
|
995
|
+
aws s3 cp "s3://${RESULTS_BUCKET}/${_JSONL_KEY}" "${RESULTS_JSONL}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
|
|
996
|
+
fi
|
|
997
|
+
if [ -n "${_JSON_KEY}" ] && [ "${_JSON_KEY}" != "None" ]; then
|
|
998
|
+
mkdir -p "$(dirname "${RESULTS_FILE}")"
|
|
999
|
+
aws s3 cp "s3://${RESULTS_BUCKET}/${_JSON_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
|
|
1000
|
+
fi
|
|
561
1001
|
fi
|
|
562
1002
|
fi
|
|
563
1003
|
|
|
@@ -573,72 +1013,156 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
573
1013
|
echo "║ Endpoint: ${ENDPOINT_NAME}"
|
|
574
1014
|
echo "╠══════════════════════════════════════════════════════════════════╣"
|
|
575
1015
|
|
|
576
|
-
# Parse and display metrics
|
|
577
|
-
# Extract key metrics from the results JSON
|
|
1016
|
+
# Parse and display metrics from profile_export.jsonl (rich per-request data)
|
|
578
1017
|
if command -v python3 &>/dev/null; then
|
|
579
1018
|
python3 -c "
|
|
580
|
-
import json, sys
|
|
1019
|
+
import json, sys, os, math
|
|
1020
|
+
|
|
1021
|
+
def percentile(sorted_vals, pct):
|
|
1022
|
+
if not sorted_vals:
|
|
1023
|
+
return None
|
|
1024
|
+
idx = (pct / 100.0) * (len(sorted_vals) - 1)
|
|
1025
|
+
lower = int(math.floor(idx))
|
|
1026
|
+
upper = int(math.ceil(idx))
|
|
1027
|
+
if lower == upper:
|
|
1028
|
+
return sorted_vals[lower]
|
|
1029
|
+
frac = idx - lower
|
|
1030
|
+
return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac
|
|
1031
|
+
|
|
1032
|
+
def fmt(val, suffix=''):
|
|
1033
|
+
if val is None:
|
|
1034
|
+
return 'N/A'
|
|
1035
|
+
return f'{val:.2f}{suffix}'
|
|
581
1036
|
|
|
582
1037
|
try:
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
1038
|
+
jsonl_path = '${RESULTS_JSONL}'
|
|
1039
|
+
json_path = '${RESULTS_FILE}'
|
|
1040
|
+
records = []
|
|
1041
|
+
|
|
1042
|
+
# Primary: read profile_export.jsonl (rich per-request data)
|
|
1043
|
+
if os.path.exists(jsonl_path):
|
|
1044
|
+
with open(jsonl_path) as f:
|
|
1045
|
+
for line in f:
|
|
1046
|
+
line = line.strip()
|
|
1047
|
+
if line:
|
|
1048
|
+
try:
|
|
1049
|
+
records.append(json.loads(line))
|
|
1050
|
+
except json.JSONDecodeError:
|
|
1051
|
+
continue
|
|
1052
|
+
|
|
1053
|
+
if records:
|
|
1054
|
+
# Extract scalar values from metric dicts {"value": X, "unit": "..."}
|
|
1055
|
+
def get_val(metrics, key):
|
|
1056
|
+
m = metrics.get(key)
|
|
1057
|
+
if isinstance(m, dict):
|
|
1058
|
+
return m.get('value')
|
|
1059
|
+
return m
|
|
1060
|
+
|
|
1061
|
+
# Collect per-request metrics
|
|
1062
|
+
latencies = []
|
|
1063
|
+
ttfts = []
|
|
1064
|
+
itls = []
|
|
1065
|
+
ttsts = []
|
|
1066
|
+
output_tokens = []
|
|
1067
|
+
start_times = []
|
|
1068
|
+
end_times = []
|
|
1069
|
+
|
|
1070
|
+
for rec in records:
|
|
1071
|
+
meta = rec.get('metadata', {})
|
|
1072
|
+
metrics = rec.get('metrics', {})
|
|
1073
|
+
|
|
1074
|
+
lat = get_val(metrics, 'request_latency')
|
|
1075
|
+
if lat is not None:
|
|
1076
|
+
latencies.append(lat)
|
|
1077
|
+
|
|
1078
|
+
ttft = get_val(metrics, 'time_to_first_token')
|
|
1079
|
+
if ttft is None:
|
|
1080
|
+
ttft = get_val(metrics, 'time_to_first_output_token')
|
|
1081
|
+
if ttft is not None:
|
|
1082
|
+
ttfts.append(ttft)
|
|
1083
|
+
|
|
1084
|
+
itl = get_val(metrics, 'inter_token_latency')
|
|
1085
|
+
if itl is not None:
|
|
1086
|
+
itls.append(itl)
|
|
1087
|
+
|
|
1088
|
+
ttst = get_val(metrics, 'time_to_second_token')
|
|
1089
|
+
if ttst is not None:
|
|
1090
|
+
ttsts.append(ttst)
|
|
1091
|
+
|
|
1092
|
+
otc = get_val(metrics, 'output_token_count')
|
|
1093
|
+
if otc is not None:
|
|
1094
|
+
output_tokens.append(otc)
|
|
1095
|
+
|
|
1096
|
+
# Track timing for throughput calculation
|
|
1097
|
+
rs = meta.get('request_start_ns')
|
|
1098
|
+
re_ = meta.get('request_end_ns')
|
|
1099
|
+
if rs is not None:
|
|
1100
|
+
start_times.append(rs)
|
|
1101
|
+
if re_ is not None:
|
|
1102
|
+
end_times.append(re_)
|
|
1103
|
+
|
|
1104
|
+
n = len(records)
|
|
1105
|
+
|
|
1106
|
+
# Compute system throughput
|
|
1107
|
+
if start_times and end_times:
|
|
1108
|
+
duration_ns = max(end_times) - min(start_times)
|
|
1109
|
+
duration_s = duration_ns / 1e9 if duration_ns > 0 else 1.0
|
|
1110
|
+
req_throughput = n / duration_s
|
|
1111
|
+
total_out_tokens = sum(output_tokens) if output_tokens else 0
|
|
1112
|
+
token_throughput = total_out_tokens / duration_s
|
|
1113
|
+
else:
|
|
1114
|
+
req_throughput = None
|
|
1115
|
+
token_throughput = None
|
|
1116
|
+
|
|
1117
|
+
# Compute percentiles
|
|
1118
|
+
latencies.sort()
|
|
1119
|
+
ttfts.sort()
|
|
1120
|
+
itls.sort()
|
|
1121
|
+
ttsts.sort()
|
|
1122
|
+
|
|
1123
|
+
print(f'║ Requests: {n}')
|
|
1124
|
+
print(f'║ Request Throughput: {fmt(req_throughput)} req/s')
|
|
1125
|
+
print(f'║ Output Token Throughput: {fmt(token_throughput)} tokens/s')
|
|
1126
|
+
print('║')
|
|
1127
|
+
print('║ Time to First Token (ms):')
|
|
1128
|
+
print(f'║ Avg: {fmt(sum(ttfts)/len(ttfts) if ttfts else None)} P50: {fmt(percentile(ttfts, 50))} P90: {fmt(percentile(ttfts, 90))} P99: {fmt(percentile(ttfts, 99))}')
|
|
1129
|
+
print('║')
|
|
1130
|
+
print('║ Inter-Token Latency (ms):')
|
|
1131
|
+
print(f'║ Avg: {fmt(sum(itls)/len(itls) if itls else None)} P50: {fmt(percentile(itls, 50))} P90: {fmt(percentile(itls, 90))} P99: {fmt(percentile(itls, 99))}')
|
|
1132
|
+
print('║')
|
|
1133
|
+
print('║ Request Latency (ms):')
|
|
1134
|
+
print(f'║ Avg: {fmt(sum(latencies)/len(latencies) if latencies else None)} P50: {fmt(percentile(latencies, 50))} P90: {fmt(percentile(latencies, 90))} P99: {fmt(percentile(latencies, 99))}')
|
|
1135
|
+
print('║')
|
|
1136
|
+
print('║ Time to Second Token (ms):')
|
|
1137
|
+
print(f'║ Avg: {fmt(sum(ttsts)/len(ttsts) if ttsts else None)} P50: {fmt(percentile(ttsts, 50))} P90: {fmt(percentile(ttsts, 90))} P99: {fmt(percentile(ttsts, 99))}')
|
|
1138
|
+
|
|
1139
|
+
else:
|
|
1140
|
+
print('║ ⚠️ No JSONL results found — cannot display metrics')
|
|
1141
|
+
print(f'║ Expected: {jsonl_path}')
|
|
626
1142
|
|
|
627
1143
|
except Exception as e:
|
|
628
1144
|
print(f'║ ⚠️ Could not parse results: {e}')
|
|
629
|
-
|
|
1145
|
+
import traceback
|
|
1146
|
+
traceback.print_exc(file=sys.stderr)
|
|
630
1147
|
"
|
|
631
1148
|
else
|
|
632
1149
|
# Fallback: display raw JSON if python3 is not available
|
|
633
1150
|
echo "║ (python3 not available — showing raw results)"
|
|
634
1151
|
echo "║"
|
|
635
|
-
|
|
1152
|
+
if [ -f "${RESULTS_JSONL}" ]; then
|
|
1153
|
+
head -3 "${RESULTS_JSONL}"
|
|
1154
|
+
elif [ -f "${RESULTS_FILE}" ]; then
|
|
1155
|
+
cat "${RESULTS_FILE}" | head -50
|
|
1156
|
+
fi
|
|
636
1157
|
fi
|
|
637
1158
|
|
|
638
1159
|
echo "╚══════════════════════════════════════════════════════════════════╝"
|
|
639
1160
|
echo ""
|
|
640
1161
|
echo "📁 Results saved to: benchmarks/${BENCHMARK_JOB_NAME}/"
|
|
641
1162
|
echo "☁️ S3 results: ${RESULTS_S3_PATH:-${BENCHMARK_S3_OUTPUT_PATH}}"
|
|
1163
|
+
|
|
1164
|
+
# Write marker for multi-level parent to find this results dir
|
|
1165
|
+
echo "${LOCAL_RESULTS_DIR}" > "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null || true
|
|
642
1166
|
else
|
|
643
1167
|
echo "⚠️ Could not download results from S3"
|
|
644
1168
|
echo " The benchmark completed but results could not be located."
|
|
@@ -661,6 +1185,36 @@ except Exception as e:
|
|
|
661
1185
|
--output table 2>/dev/null || echo " (could not list objects)"
|
|
662
1186
|
fi
|
|
663
1187
|
|
|
1188
|
+
# ── Persist benchmark results to Athena ──────────────────────────────────
|
|
1189
|
+
# When CI_BENCHMARK_RESULTS_BUCKET is set (from bootstrap config), call the
|
|
1190
|
+
# benchmark writer to persist results as Parquet to S3 for Athena querying.
|
|
1191
|
+
# Skip when running as a child of multi-level mode — the parent orchestrator
|
|
1192
|
+
# handles combined persistence (one row per concurrency level, no duplicates).
|
|
1193
|
+
if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ] && [ "${RESULTS_DOWNLOADED}" = true ] && [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
|
|
1194
|
+
echo ""
|
|
1195
|
+
echo "📊 Persisting benchmark results to Athena..."
|
|
1196
|
+
|
|
1197
|
+
# Determine which results file to pass to the writer (prefer JSONL)
|
|
1198
|
+
_WRITER_INPUT="${RESULTS_JSONL}"
|
|
1199
|
+
if [ ! -f "${_WRITER_INPUT}" ]; then
|
|
1200
|
+
_WRITER_INPUT="${RESULTS_FILE}"
|
|
1201
|
+
fi
|
|
1202
|
+
|
|
1203
|
+
# Best-effort: errors are logged but do not fail the benchmark script
|
|
1204
|
+
if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
|
|
1205
|
+
--results-file "${_WRITER_INPUT}" \
|
|
1206
|
+
--config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
|
|
1207
|
+
--project-name "${PROJECT_NAME}" \
|
|
1208
|
+
--workload "${BENCHMARK_WORKLOAD:-manual}" \
|
|
1209
|
+
--concurrency "${BENCHMARK_CONCURRENCY}" \
|
|
1210
|
+
--bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
|
|
1211
|
+
--region "${AWS_REGION:-${REGION}}"; then
|
|
1212
|
+
echo "✅ Benchmark results persisted to S3"
|
|
1213
|
+
else
|
|
1214
|
+
echo "⚠️ Failed to persist benchmark results to Athena (non-fatal)"
|
|
1215
|
+
echo " Results remain available locally in: benchmarks/${BENCHMARK_JOB_NAME}/"
|
|
1216
|
+
fi
|
|
1217
|
+
fi
|
|
664
1218
|
elif [ "${JOB_STATUS}" = "Failed" ]; then
|
|
665
1219
|
# Display failure reason
|
|
666
1220
|
echo "❌ Step 4: Benchmark job failed"
|