@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -1,4 +1,3 @@
1
- #!/bin/bash
2
1
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
2
  # SPDX-License-Identifier: Apache-2.0
4
3
 
@@ -19,22 +18,27 @@ CLEAN_AFTER=false
19
18
  FORCE=false
20
19
  IC_ARG=""
21
20
  ADAPTER_ARG=""
21
+ ARG_NO_STALE_WARNING=false
22
+ ARG_WORKLOAD=""
22
23
  while [ $# -gt 0 ]; do
23
24
  case "$1" in
24
25
  --clean) CLEAN_AFTER=true; shift ;;
25
26
  --force) FORCE=true; shift ;;
27
+ --no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
28
+ --workload) shift; ARG_WORKLOAD="${1:-}"; shift ;;
26
29
  --ic) shift; IC_ARG="${1:-}"; shift ;;
27
30
  --adapter) shift; ADAPTER_ARG="${1:-}"; shift ;;
28
31
  --help|-h)
29
- echo "Usage: ./do/benchmark [--ic <name>] [--adapter <name>] [--force] [--clean]"
32
+ echo "Usage: ./do/benchmark [--workload <name>] [--ic <name>] [--adapter <name>] [--force] [--clean] [--no-stale-warning]"
30
33
  echo ""
31
34
  echo "Run SageMaker AI Benchmark against the deployed endpoint."
32
35
  echo ""
33
36
  echo "Options:"
34
- echo " --ic <name> Benchmark a specific inference component"
35
- echo " --adapter <name> Benchmark a specific LoRA adapter IC"
36
- echo " --force Create a new benchmark job even if one is already running"
37
- echo " --clean Delete workload config and benchmark job after displaying results"
37
+ echo " --ic <name> Benchmark a specific inference component"
38
+ echo " --adapter <name> Benchmark a specific LoRA adapter IC"
39
+ echo " --force Create a new benchmark job even if one is already running"
40
+ echo " --clean Delete workload config and benchmark job after displaying results"
41
+ echo " --no-stale-warning Suppress schema registry staleness warning"
38
42
  echo ""
39
43
  echo "IC resolution:"
40
44
  echo " --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
@@ -54,6 +58,433 @@ while [ $# -gt 0 ]; do
54
58
  esac
55
59
  done
56
60
 
61
+
62
+ # ── Require --workload flag ───────────────────────────────────────────────────
63
+ if [ -z "${ARG_WORKLOAD}" ]; then
64
+ echo "❌ --workload <name> is required"
65
+ echo ""
66
+ # List available workloads from the MCP catalog
67
+ _CATALOG_FOR_HELP=""
68
+ if command -v npm &>/dev/null; then
69
+ _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
70
+ if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
71
+ _CATALOG_FOR_HELP="${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
72
+ fi
73
+ fi
74
+ if [ -n "${_CATALOG_FOR_HELP}" ]; then
75
+ echo " Available workloads:"
76
+ python3 -c "
77
+ import json
78
+ with open('${_CATALOG_FOR_HELP}') as f:
79
+ catalog = json.load(f)
80
+ for name, wl in catalog.get('workloads', {}).items():
81
+ print(f' {name:30s} {wl.get("description", "")[:50]}')
82
+ " 2>/dev/null || echo " (could not read workload catalog)"
83
+ else
84
+ echo " Run 'ml-container-creator mcp init' to install workload profiles"
85
+ fi
86
+ echo ""
87
+ echo " Usage: ./do/benchmark --workload multi_turn_chat"
88
+ exit 1
89
+ fi
90
+
91
+ # ── Workload Resolution (from workload-picker MCP server catalog) ─────────────
92
+ # If --workload is passed with a named workload (not "manual"), resolve
93
+ # the workload parameters from the MCP server's catalog file. This overrides
94
+ # BENCHMARK_INPUT_TOKENS_MEAN, BENCHMARK_OUTPUT_TOKENS_MEAN, BENCHMARK_STREAMING,
95
+ # and BENCHMARK_CONCURRENCY_LEVELS from do/config.
96
+ BENCHMARK_WORKLOAD="${ARG_WORKLOAD:-manual}"
97
+
98
+ if [ "${BENCHMARK_WORKLOAD}" != "manual" ]; then
99
+ # Locate the workload catalog (npm global or local)
100
+ _WORKLOAD_CATALOG=""
101
+ if [ -f "$(dirname "${BASH_SOURCE[0]}")/../node_modules/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
102
+ _WORKLOAD_CATALOG="$(dirname "${BASH_SOURCE[0]}")/../node_modules/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
103
+ elif command -v npm &>/dev/null; then
104
+ _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
105
+ if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
106
+ _WORKLOAD_CATALOG="${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
107
+ fi
108
+ fi
109
+
110
+ if [ -n "${_WORKLOAD_CATALOG}" ]; then
111
+ _WL_PARAMS=$(python3 -c "
112
+ import json, sys
113
+ with open('${_WORKLOAD_CATALOG}') as f:
114
+ catalog = json.load(f)
115
+ wl = catalog.get('workloads', {}).get('${BENCHMARK_WORKLOAD}')
116
+ if wl:
117
+ print(json.dumps(wl))
118
+ else:
119
+ print('null')
120
+ " 2>/dev/null) || _WL_PARAMS="null"
121
+
122
+ if [ "${_WL_PARAMS}" != "null" ] && [ -n "${_WL_PARAMS}" ]; then
123
+ echo "📋 Workload profile: ${BENCHMARK_WORKLOAD}"
124
+ BENCHMARK_INPUT_TOKENS_MEAN=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['input_tokens_mean'])")
125
+ BENCHMARK_OUTPUT_TOKENS_MEAN=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['output_tokens_mean'])")
126
+ BENCHMARK_STREAMING=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(str(json.load(sys.stdin)['streaming']).lower())")
127
+ # Set concurrency levels for multi-level mode if not already overridden
128
+ if [ -z "${BENCHMARK_CONCURRENCY_LEVELS:-}" ]; then
129
+ BENCHMARK_CONCURRENCY_LEVELS=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(','.join(str(x) for x in json.load(sys.stdin)['concurrency_levels']))")
130
+ fi
131
+ # Also override single-level BENCHMARK_CONCURRENCY with first level from workload
132
+ BENCHMARK_CONCURRENCY=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['concurrency_levels'][0])")
133
+ echo " Input tokens: ${BENCHMARK_INPUT_TOKENS_MEAN}, Output tokens: ${BENCHMARK_OUTPUT_TOKENS_MEAN}"
134
+ echo " Streaming: ${BENCHMARK_STREAMING}, Concurrency: ${BENCHMARK_CONCURRENCY_LEVELS:-${BENCHMARK_CONCURRENCY}}"
135
+ echo ""
136
+ else
137
+ echo "⚠️ Unknown workload '${BENCHMARK_WORKLOAD}' — using do/config defaults"
138
+ fi
139
+ else
140
+ echo "⚠️ Workload catalog not found — using do/config defaults"
141
+ fi
142
+ fi
143
+
144
+ # ── Resolve profile-level values ──────────────────────────────────────────────
145
+ # Read S3 buckets and account info from the bootstrap profile
146
+ _PROFILE_JSON=""
147
+ if command -v python3 &>/dev/null; then
148
+ _PROFILE_JSON=$(python3 -c "
149
+ import json, os
150
+ config_path = os.path.expanduser('~/.ml-container-creator/config.json')
151
+ try:
152
+ with open(config_path) as f:
153
+ config = json.load(f)
154
+ profile = config['profiles'][config['activeProfile']]
155
+ print(json.dumps(profile))
156
+ except:
157
+ print('{}')
158
+ " 2>/dev/null) || _PROFILE_JSON="{}"
159
+ fi
160
+
161
+ # Extract benchmark-relevant profile values
162
+ BENCHMARK_S3_OUTPUT_PATH=$(echo "${_PROFILE_JSON}" | python3 -c "
163
+ import sys, json
164
+ p = json.load(sys.stdin)
165
+ bucket = p.get('benchmarkS3Bucket', '')
166
+ if not bucket:
167
+ acct = p.get('accountId', 'unknown')
168
+ region = p.get('awsRegion', 'us-east-1')
169
+ bucket = f'mlcc-benchmark-{acct}-{region}'
170
+ print(f's3://{bucket}/${PROJECT_NAME}/')
171
+ " 2>/dev/null) || BENCHMARK_S3_OUTPUT_PATH=""
172
+
173
+ CI_BENCHMARK_RESULTS_BUCKET=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ciBenchmarkResultsBucket', ''))" 2>/dev/null) || CI_BENCHMARK_RESULTS_BUCKET=""
174
+
175
+ # Derive job names at runtime (unique per invocation)
176
+ BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)"
177
+ BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
178
+
179
+ # Ensure benchmark params have defaults (in case workload catalog wasn't found)
180
+ BENCHMARK_CONCURRENCY=${BENCHMARK_CONCURRENCY:-10}
181
+ BENCHMARK_INPUT_TOKENS_MEAN=${BENCHMARK_INPUT_TOKENS_MEAN:-550}
182
+ BENCHMARK_OUTPUT_TOKENS_MEAN=${BENCHMARK_OUTPUT_TOKENS_MEAN:-150}
183
+ BENCHMARK_STREAMING=${BENCHMARK_STREAMING:-true}
184
+
185
+
186
+
187
+ # ── Multi-level concurrency support (CI Stage 2) ─────────────────────────────
188
+ # When BENCHMARK_CONCURRENCY_LEVELS is set (comma-separated integers, e.g. "1,4,8"
189
+ # or JSON array string, e.g. "[1,4,8]"), and we are NOT already in single-level
190
+ # execution mode (_BENCHMARK_SINGLE_LEVEL), the script iterates over each level,
191
+ # re-invoking itself for each one.
192
+ # Results from all levels are aggregated into a combined JSON for the benchmark writer.
193
+ # This supports Requirement 1.5: configurable concurrency levels per config.
194
+ if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
195
+ # Normalize: strip brackets and spaces, convert to comma-separated
196
+ _NORMALIZED_LEVELS=$(echo "${BENCHMARK_CONCURRENCY_LEVELS}" | tr -d '[] ' )
197
+
198
+ # Skip if empty after normalization
199
+ if [ -n "${_NORMALIZED_LEVELS}" ]; then
200
+ echo "📊 Multi-level benchmark: running concurrency levels [${_NORMALIZED_LEVELS}]"
201
+ echo ""
202
+
203
+ IFS=',' read -ra _LEVELS <<< "${_NORMALIZED_LEVELS}"
204
+ _ALL_RESULTS_DIR="${SCRIPT_DIR}/../benchmarks/multi-level-$(date +%Y%m%d-%H%M%S)"
205
+ mkdir -p "${_ALL_RESULTS_DIR}"
206
+ _LEVEL_FAILURES=0
207
+
208
+ for _LEVEL in "${_LEVELS[@]}"; do
209
+ _LEVEL=$(echo "${_LEVEL}" | tr -d ' ')
210
+ # Skip non-numeric values
211
+ if ! [[ "${_LEVEL}" =~ ^[0-9]+$ ]]; then
212
+ echo "⚠️ Skipping invalid concurrency level: ${_LEVEL}"
213
+ continue
214
+ fi
215
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
216
+ echo " Running benchmark at concurrency level: ${_LEVEL}"
217
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
218
+ echo ""
219
+
220
+ # Re-invoke self with overridden concurrency and single-level flag
221
+ export BENCHMARK_CONCURRENCY="${_LEVEL}"
222
+ export _BENCHMARK_SINGLE_LEVEL=1
223
+ # Build argument list for re-invocation
224
+ _REINVOKE_ARGS="--force"
225
+ if [ "${CLEAN_AFTER}" = true ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --clean"; fi
226
+ if [ "${ARG_NO_STALE_WARNING}" = true ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --no-stale-warning"; fi
227
+ if [ -n "${ARG_WORKLOAD}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --workload ${ARG_WORKLOAD}"; fi
228
+ if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --ic ${IC_ARG}"; fi
229
+ if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --adapter ${ADAPTER_ARG}"; fi
230
+
231
+ if "${BASH_SOURCE[0]}" ${_REINVOKE_ARGS}; then
232
+ # Copy results to aggregation directory — find the child's results
233
+ # Try the marker file first (set by child), then fall back to ls -td
234
+ _LATEST_JOB_DIR=""
235
+ if [ -f "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" ]; then
236
+ _LATEST_JOB_DIR=$(cat "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null)
237
+ fi
238
+ if [ -z "${_LATEST_JOB_DIR}" ] || [ ! -d "${_LATEST_JOB_DIR}" ]; then
239
+ _LATEST_JOB_DIR=$(ls -td "${SCRIPT_DIR}/../benchmarks/${PROJECT_NAME}-benchmark-"* 2>/dev/null | head -1)
240
+ fi
241
+ if [ -n "${_LATEST_JOB_DIR}" ] && [ -d "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export.jsonl" ]; then
242
+ cp "${_LATEST_JOB_DIR}/output/profile_export.jsonl" "${_ALL_RESULTS_DIR}/profile-concurrency-${_LEVEL}.jsonl"
243
+ elif [ -n "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" ]; then
244
+ cp "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" "${_ALL_RESULTS_DIR}/results-concurrency-${_LEVEL}.json"
245
+ fi
246
+ else
247
+ echo "⚠️ Benchmark at concurrency ${_LEVEL} failed (non-fatal, continuing)"
248
+ _LEVEL_FAILURES=$((_LEVEL_FAILURES + 1))
249
+ fi
250
+ unset _BENCHMARK_SINGLE_LEVEL
251
+ echo ""
252
+ done
253
+
254
+ # Aggregate results into a combined JSON file for the benchmark writer
255
+ # Reads per-level JSONL files and computes aggregate metrics per concurrency level
256
+ echo "📊 Aggregating multi-level results..."
257
+ _COMBINED_FILE="${_ALL_RESULTS_DIR}/results.json"
258
+ python3 -c "
259
+ import json, glob, sys, os, math
260
+
261
+ def percentile(sorted_vals, pct):
262
+ if not sorted_vals:
263
+ return 0.0
264
+ idx = (pct / 100.0) * (len(sorted_vals) - 1)
265
+ lower = int(math.floor(idx))
266
+ upper = int(math.ceil(idx))
267
+ if lower == upper:
268
+ return sorted_vals[lower]
269
+ frac = idx - lower
270
+ return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac
271
+
272
+ def get_val(metrics, key):
273
+ m = metrics.get(key)
274
+ if isinstance(m, dict):
275
+ return m.get('value')
276
+ return m
277
+
278
+ results_dir = '${_ALL_RESULTS_DIR}'
279
+ combined = {'metrics': []}
280
+
281
+ # Process JSONL files (preferred)
282
+ for f in sorted(glob.glob(os.path.join(results_dir, 'profile-concurrency-*.jsonl'))):
283
+ try:
284
+ level = int(os.path.basename(f).replace('profile-concurrency-', '').replace('.jsonl', ''))
285
+ records = []
286
+ with open(f) as fp:
287
+ for line in fp:
288
+ line = line.strip()
289
+ if line:
290
+ records.append(json.loads(line))
291
+
292
+ if not records:
293
+ continue
294
+
295
+ # Aggregate per-request metrics
296
+ latencies, ttfts, itls, ttsts, out_tokens = [], [], [], [], []
297
+ start_times, end_times, in_tokens = [], [], []
298
+ prefill_tps, output_tps = [], []
299
+
300
+ for rec in records:
301
+ meta = rec.get('metadata', {})
302
+ metrics = rec.get('metrics', {})
303
+ lat = get_val(metrics, 'request_latency')
304
+ if lat is not None: latencies.append(lat)
305
+ ttft = get_val(metrics, 'time_to_first_token') or get_val(metrics, 'time_to_first_output_token')
306
+ if ttft is not None: ttfts.append(ttft)
307
+ itl = get_val(metrics, 'inter_token_latency')
308
+ if itl is not None: itls.append(itl)
309
+ ttst = get_val(metrics, 'time_to_second_token')
310
+ if ttst is not None: ttsts.append(ttst)
311
+ otc = get_val(metrics, 'output_token_count')
312
+ if otc is not None: out_tokens.append(otc)
313
+ isl = get_val(metrics, 'input_sequence_length')
314
+ if isl is not None: in_tokens.append(isl)
315
+ ptps = get_val(metrics, 'prefill_throughput_per_user')
316
+ if ptps is not None: prefill_tps.append(ptps)
317
+ otps = get_val(metrics, 'output_token_throughput_per_user')
318
+ if otps is not None: output_tps.append(otps)
319
+ rs = meta.get('request_start_ns')
320
+ re_ = meta.get('request_end_ns')
321
+ if rs: start_times.append(rs)
322
+ if re_: end_times.append(re_)
323
+
324
+ # Sort for percentiles
325
+ latencies.sort()
326
+ ttfts.sort()
327
+ itls.sort()
328
+ ttsts.sort()
329
+ prefill_tps.sort()
330
+ output_tps.sort()
331
+
332
+ # Compute throughput
333
+ duration_s = (max(end_times) - min(start_times)) / 1e9 if start_times and end_times else 1.0
334
+ duration_s = max(duration_s, 0.001)
335
+ req_throughput = len(records) / duration_s
336
+ token_throughput = sum(out_tokens) / duration_s if out_tokens else 0.0
337
+
338
+ entry = {
339
+ 'concurrency': level,
340
+ 'request_throughput': req_throughput,
341
+ 'output_token_throughput': token_throughput,
342
+ 'total_requests': len(records),
343
+ 'duration_seconds': duration_s,
344
+ 'time_to_first_token': {
345
+ 'avg': sum(ttfts)/len(ttfts) if ttfts else 0.0,
346
+ 'p50': percentile(ttfts, 50),
347
+ 'p90': percentile(ttfts, 90),
348
+ 'p99': percentile(ttfts, 99),
349
+ },
350
+ 'inter_token_latency': {
351
+ 'avg': sum(itls)/len(itls) if itls else 0.0,
352
+ 'p50': percentile(itls, 50),
353
+ 'p90': percentile(itls, 90),
354
+ 'p99': percentile(itls, 99),
355
+ },
356
+ 'e2e_latency': {
357
+ 'avg': sum(latencies)/len(latencies) if latencies else 0.0,
358
+ 'p50': percentile(latencies, 50),
359
+ 'p90': percentile(latencies, 90),
360
+ 'p99': percentile(latencies, 99),
361
+ },
362
+ 'time_to_second_token': {
363
+ 'p50': percentile(ttsts, 50),
364
+ 'p90': percentile(ttsts, 90),
365
+ },
366
+ 'prefill_throughput': {
367
+ 'avg': sum(prefill_tps)/len(prefill_tps) if prefill_tps else 0.0,
368
+ 'p50': percentile(prefill_tps, 50),
369
+ },
370
+ 'output_token_throughput_detail': {
371
+ 'avg': sum(output_tps)/len(output_tps) if output_tps else 0.0,
372
+ 'p50': percentile(output_tps, 50),
373
+ 'p90': percentile(output_tps, 90),
374
+ },
375
+ 'total_token_throughput': (sum(out_tokens) + sum(in_tokens)) / duration_s if (out_tokens or in_tokens) else 0.0,
376
+ 'output_sequence_length': sum(out_tokens)/len(out_tokens) if out_tokens else 0.0,
377
+ 'input_sequence_length': sum(in_tokens)/len(in_tokens) if in_tokens else 0.0,
378
+ 'request_count': len(records),
379
+ 'input_tokens_mean': ${BENCHMARK_INPUT_TOKENS_MEAN:-0},
380
+ 'output_tokens_mean': ${BENCHMARK_OUTPUT_TOKENS_MEAN:-0},
381
+ }
382
+ combined['metrics'].append(entry)
383
+ except Exception as e:
384
+ print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)
385
+
386
+ # Fallback: process old-style JSON files if no JSONL found
387
+ if not combined['metrics']:
388
+ for f in sorted(glob.glob(os.path.join(results_dir, 'results-concurrency-*.json'))):
389
+ try:
390
+ with open(f) as fp:
391
+ data = json.load(fp)
392
+ level = int(os.path.basename(f).replace('results-concurrency-', '').replace('.json', ''))
393
+ if isinstance(data, dict):
394
+ data['concurrency'] = level
395
+ combined['metrics'].append(data)
396
+ except Exception as e:
397
+ print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)
398
+
399
+ with open('${_COMBINED_FILE}', 'w') as fp:
400
+ try:
401
+ json.dump(combined, fp, indent=2)
402
+ except TypeError as te:
403
+ print(f'Warning: JSON serialize error: {str(te)}', file=sys.stderr)
404
+ fp.write(json.dumps({'metrics': []}, indent=2))
405
+ n_metrics = len(combined.get('metrics', []))
406
+ print(f'Combined {n_metrics} concurrency level results')
407
+ " 2>&1
408
+
409
+ # Persist to Athena if CI mode is active
410
+ if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ] && [ -f "${_COMBINED_FILE}" ]; then
411
+ echo ""
412
+ echo "📊 Persisting multi-level benchmark results to Athena..."
413
+
414
+ _compute_config_id() {
415
+ local input="${DEPLOYMENT_CONFIG}:${MODEL_NAME:-none}:${INSTANCE_TYPE}:${AWS_REGION}:${DEPLOYMENT_TARGET}:ic${IC_COUNT:-1}:adapt${ADAPTER_COUNT:-0}"
416
+ if command -v sha256sum &> /dev/null; then
417
+ echo -n "$input" | sha256sum | cut -c1-16
418
+ else
419
+ echo -n "$input" | shasum -a 256 | cut -c1-16
420
+ fi
421
+ }
422
+ CONFIG_ID=$(_compute_config_id)
423
+
424
+ if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
425
+ --results-file "${_COMBINED_FILE}" \
426
+ --config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
427
+ --project-name "${PROJECT_NAME}" \
428
+ --workload "${BENCHMARK_WORKLOAD:-manual}" \
429
+ --bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
430
+ --region "${AWS_REGION:-${REGION}}"; then
431
+ echo "✅ Multi-level benchmark results persisted to S3"
432
+ else
433
+ echo "⚠️ Failed to persist multi-level benchmark results to Athena (non-fatal)"
434
+ fi
435
+ fi
436
+
437
+ echo ""
438
+ echo "📋 Multi-level Summary:"
439
+ echo " Levels tested: ${_NORMALIZED_LEVELS}"
440
+ echo " Failures: ${_LEVEL_FAILURES} / ${#_LEVELS[@]}"
441
+ echo " Results: ${_ALL_RESULTS_DIR}/"
442
+
443
+ if [ ${_LEVEL_FAILURES} -ge ${#_LEVELS[@]} ]; then
444
+ echo "❌ All concurrency levels failed"
445
+ exit 1
446
+ fi
447
+ exit 0
448
+ fi
449
+ fi
450
+
451
+ # ── _check_schema_registry_staleness() ────────────────────────────────────────
452
+ # Warn if the schema registry manifest's lastSynced timestamp is older than threshold.
453
+ # Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
454
+ # Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
455
+ _check_schema_registry_staleness() {
456
+ if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
457
+ return 0
458
+ fi
459
+ local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
460
+ local manifest_file="${HOME}/.ml-container-creator/schemas/manifest.json"
461
+ if [ ! -f "${manifest_file}" ]; then
462
+ return 0
463
+ fi
464
+ local last_synced
465
+ last_synced=$(python3 -c "
466
+ import json, sys
467
+ from datetime import datetime, timezone
468
+ try:
469
+ with open('${manifest_file}') as f:
470
+ manifest = json.load(f)
471
+ ls = manifest.get('lastSynced', '')
472
+ if not ls:
473
+ sys.exit(0)
474
+ synced = datetime.fromisoformat(ls.replace('Z', '+00:00'))
475
+ days = (datetime.now(timezone.utc) - synced).days
476
+ if days > int('${threshold}'):
477
+ print(days)
478
+ except:
479
+ pass
480
+ " 2>/dev/null)
481
+ if [ -n "${last_synced}" ]; then
482
+ echo "⚠️ Schema registry is ${last_synced} days old. Run 'ml-container-creator bootstrap sync-schemas' to update."
483
+ fi
484
+ }
485
+
486
+ _check_schema_registry_staleness
487
+
57
488
  # ── Verify AWS CLI v2 ─────────────────────────────────────────────────────────
58
489
  if ! aws --version 2>&1 | grep -q "aws-cli/2"; then
59
490
  echo "❌ AWS CLI v2 is required for benchmarking."
@@ -185,7 +616,7 @@ if [ "${FORCE}" = false ] && [ -n "${BENCHMARK_JOB_NAME:-}" ]; then
185
616
  fi
186
617
 
187
618
  # ── Configuration ─────────────────────────────────────────────────────────────
188
- WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config"
619
+ WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
189
620
  if [ "${RESUME_EXISTING}" = false ]; then
190
621
  BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)"
191
622
  fi
@@ -357,6 +788,7 @@ if [ -n "${EXISTING_CONFIG_SPEC}" ]; then
357
788
 
358
789
  if [ "${EXISTING_NORMALIZED}" = "${DESIRED_NORMALIZED}" ]; then
359
790
  echo " ✅ Existing workload config matches current parameters — reusing"
791
+ CREATE_WORKLOAD_CONFIG=false
360
792
  else
361
793
  echo " ⚠️ Workload config parameters changed — recreating..."
362
794
  aws sagemaker delete-ai-workload-config \
@@ -484,10 +916,11 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
484
916
  # Persist results locally to benchmarks/<job-name>/
485
917
  PROJECT_ROOT="${SCRIPT_DIR}/.."
486
918
  LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${BENCHMARK_JOB_NAME}"
487
- RESULTS_FILE="${LOCAL_RESULTS_DIR}/results.json"
919
+ RESULTS_JSONL="${LOCAL_RESULTS_DIR}/output/profile_export.jsonl"
920
+ RESULTS_FILE="${LOCAL_RESULTS_DIR}/output/profile_export_aiperf.json"
488
921
 
489
922
  # Check if results already exist locally (idempotency: skip S3 download)
490
- if [ -f "${RESULTS_FILE}" ]; then
923
+ if [ -f "${RESULTS_JSONL}" ] || [ -f "${RESULTS_FILE}" ]; then
491
924
  echo "📥 Step 4: Results already available locally"
492
925
  RESULTS_DOWNLOADED=true
493
926
  else
@@ -513,12 +946,27 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
513
946
  # This is the most reliable approach — handles any subdirectory structure
514
947
  echo " Syncing results from S3..."
515
948
  if aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/" --region "${AWS_REGION}" 2>/dev/null; then
516
- # Look for any JSON file in the synced directory tree
517
- FOUND_FILE=$(find "${LOCAL_RESULTS_DIR}" -name "*.json" -type f 2>/dev/null | head -1)
518
- if [ -n "${FOUND_FILE}" ]; then
519
- # If the found file isn't already at our canonical path, copy it there
520
- if [ "${FOUND_FILE}" != "${RESULTS_FILE}" ]; then
521
- cp "${FOUND_FILE}" "${RESULTS_FILE}"
949
+ # Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
950
+ for ARCHIVE in $(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f 2>/dev/null); do
951
+ ARCHIVE_DIR=$(dirname "${ARCHIVE}")
952
+ tar -xzf "${ARCHIVE}" -C "${ARCHIVE_DIR}" 2>/dev/null || true
953
+ done
954
+
955
+ # Look for specific result files (priority: JSONL > aiperf JSON)
956
+ _FOUND_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
957
+ _FOUND_JSON=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1)
958
+
959
+ if [ -n "${_FOUND_JSONL}" ]; then
960
+ if [ "${_FOUND_JSONL}" != "${RESULTS_JSONL}" ]; then
961
+ mkdir -p "$(dirname "${RESULTS_JSONL}")"
962
+ cp "${_FOUND_JSONL}" "${RESULTS_JSONL}"
963
+ fi
964
+ RESULTS_DOWNLOADED=true
965
+ fi
966
+ if [ -n "${_FOUND_JSON}" ]; then
967
+ if [ "${_FOUND_JSON}" != "${RESULTS_FILE}" ]; then
968
+ mkdir -p "$(dirname "${RESULTS_FILE}")"
969
+ cp "${_FOUND_JSON}" "${RESULTS_FILE}"
522
970
  fi
523
971
  RESULTS_DOWNLOADED=true
524
972
  fi
@@ -531,33 +979,25 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
531
979
  RESULTS_BUCKET=$(echo "${RESULTS_S3_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
532
980
  RESULTS_PREFIX=$(echo "${RESULTS_S3_PATH}" | sed "s|s3://${RESULTS_BUCKET}/||")
533
981
 
534
- # List all objects under the output path and find data files
535
- # aws s3api list-objects-v2 is more reliable than aws s3 ls --recursive
536
- FOUND_KEY=$(aws s3api list-objects-v2 \
982
+ # List all objects and look for our target files
983
+ _ALL_KEYS=$(aws s3api list-objects-v2 \
537
984
  --bucket "${RESULTS_BUCKET}" \
538
985
  --prefix "${RESULTS_PREFIX}" \
539
986
  --region "${AWS_REGION}" \
540
987
  --query 'Contents[].Key' \
541
- --output text 2>/dev/null \
542
- | tr '\t' '\n' \
543
- | grep -E '\.(json|jsonl|csv)$' \
544
- | head -1)
545
-
546
- if [ -n "${FOUND_KEY}" ] && [ "${FOUND_KEY}" != "None" ]; then
547
- if aws s3 cp "s3://${RESULTS_BUCKET}/${FOUND_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
548
- RESULTS_DOWNLOADED=true
549
- fi
550
- fi
551
- fi
988
+ --output text 2>/dev/null | tr '\t' '\n')
552
989
 
553
- # Strategy 3: If still nothing, try direct path patterns the service might use
554
- if [ "${RESULTS_DOWNLOADED}" = false ]; then
555
- for PATTERN in "results.json" "benchmark_results.json" "output.json"; do
556
- if aws s3 cp "${RESULTS_S3_PATH}${PATTERN}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
557
- RESULTS_DOWNLOADED=true
558
- break
559
- fi
560
- done
990
+ _JSONL_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export\.jsonl$" | head -1)
991
+ _JSON_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export_aiperf\.json$" | head -1)
992
+
993
+ if [ -n "${_JSONL_KEY}" ] && [ "${_JSONL_KEY}" != "None" ]; then
994
+ mkdir -p "$(dirname "${RESULTS_JSONL}")"
995
+ aws s3 cp "s3://${RESULTS_BUCKET}/${_JSONL_KEY}" "${RESULTS_JSONL}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
996
+ fi
997
+ if [ -n "${_JSON_KEY}" ] && [ "${_JSON_KEY}" != "None" ]; then
998
+ mkdir -p "$(dirname "${RESULTS_FILE}")"
999
+ aws s3 cp "s3://${RESULTS_BUCKET}/${_JSON_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
1000
+ fi
561
1001
  fi
562
1002
  fi
563
1003
 
@@ -573,72 +1013,156 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
573
1013
  echo "║ Endpoint: ${ENDPOINT_NAME}"
574
1014
  echo "╠══════════════════════════════════════════════════════════════════╣"
575
1015
 
# Parse and display metrics from profile_export.jsonl (rich per-request data)

#######################################
# Print a latency/throughput summary for a per-request JSONL export.
# Arguments: $1 - path to the JSONL file (one JSON record per line)
# Outputs:   summary rows prefixed with '║' on stdout; a traceback on stderr
#            if summarising fails unexpectedly
# Notes:     The path is handed to Python as an argument instead of being
#            spliced into the program text, so quotes or backslashes in the
#            path cannot break (or inject into) the script. The quoted
#            heredoc delimiter keeps the shell from expanding anything in it.
#######################################
_mlcc_print_jsonl_metrics() {
  python3 - "$1" <<'PY'
import json
import math
import os
import sys
import traceback


def percentile(sorted_vals, pct):
    """Linearly interpolated percentile of an ascending list (None if empty)."""
    if not sorted_vals:
        return None
    # Clamp so an out-of-range pct cannot index past either end of the list.
    pct = min(max(pct, 0), 100)
    idx = (pct / 100.0) * (len(sorted_vals) - 1)
    lower = int(math.floor(idx))
    upper = int(math.ceil(idx))
    if lower == upper:
        return sorted_vals[lower]
    frac = idx - lower
    return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac


def fmt(val, suffix=''):
    """Two-decimal rendering of a number; 'N/A' when the value is missing."""
    if val is None:
        return 'N/A'
    return f'{val:.2f}{suffix}'


def mean(vals):
    """Arithmetic mean, or None for an empty list."""
    return sum(vals) / len(vals) if vals else None


def as_number(val):
    """Return val if it is a finite int/float, else None.

    Anything else (strings, bools, nested objects, NaN/inf) would make
    sort()/sum() fail or skew the statistics, so it is dropped instead of
    aborting the whole summary.
    """
    if isinstance(val, bool) or not isinstance(val, (int, float)):
        return None
    if isinstance(val, float) and not math.isfinite(val):
        return None
    return val


def get_val(metrics, key):
    """Extract the scalar from a metric stored as {"value": X, "unit": "..."}
    or as a bare number."""
    m = metrics.get(key)
    if isinstance(m, dict):
        m = m.get('value')
    return as_number(m)


def as_dict(val):
    """Return val when it is a dict, otherwise an empty dict."""
    return val if isinstance(val, dict) else {}


def stats_row(vals):
    """One 'Avg / P50 / P90 / P99' row for an ascending list of values."""
    return (
        f'║ Avg: {fmt(mean(vals))} P50: {fmt(percentile(vals, 50))} '
        f'P90: {fmt(percentile(vals, 90))} P99: {fmt(percentile(vals, 99))}'
    )


def load_records(path):
    """Read JSON objects from a JSONL file, skipping blank, malformed and
    non-object lines. Returns [] when the file does not exist."""
    records = []
    if not os.path.exists(path):
        return records
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(rec, dict):
                records.append(rec)
    return records


def main(argv):
    jsonl_path = argv[1] if len(argv) > 1 else ''
    records = load_records(jsonl_path)

    if not records:
        print('║ ⚠️ No JSONL results found — cannot display metrics')
        print(f'║ Expected: {jsonl_path}')
        return

    # Collect per-request metrics
    latencies = []
    ttfts = []
    itls = []
    ttsts = []
    output_tokens = []
    start_times = []
    end_times = []

    for rec in records:
        meta = as_dict(rec.get('metadata'))
        metrics = as_dict(rec.get('metrics'))

        lat = get_val(metrics, 'request_latency')
        if lat is not None:
            latencies.append(lat)

        # Two key spellings are accepted for time-to-first-token.
        ttft = get_val(metrics, 'time_to_first_token')
        if ttft is None:
            ttft = get_val(metrics, 'time_to_first_output_token')
        if ttft is not None:
            ttfts.append(ttft)

        itl = get_val(metrics, 'inter_token_latency')
        if itl is not None:
            itls.append(itl)

        ttst = get_val(metrics, 'time_to_second_token')
        if ttst is not None:
            ttsts.append(ttst)

        otc = get_val(metrics, 'output_token_count')
        if otc is not None:
            output_tokens.append(otc)

        # Track timing for throughput calculation
        rs = as_number(meta.get('request_start_ns'))
        re_ = as_number(meta.get('request_end_ns'))
        if rs is not None:
            start_times.append(rs)
        if re_ is not None:
            end_times.append(re_)

    n = len(records)

    # System throughput over the window from the earliest request start to
    # the latest request end (nanosecond timestamps converted to seconds).
    # Without a positive window the figures are reported as N/A rather than
    # computed against an invented duration.
    req_throughput = None
    token_throughput = None
    if start_times and end_times:
        duration_ns = max(end_times) - min(start_times)
        if duration_ns > 0:
            duration_s = duration_ns / 1e9
            req_throughput = n / duration_s
            token_throughput = sum(output_tokens) / duration_s

    # percentile() expects ascending input
    latencies.sort()
    ttfts.sort()
    itls.sort()
    ttsts.sort()

    # NOTE(review): the "(ms)" labels are fixed text; values are printed as
    # stored and each metric's "unit" field is not consulted — confirm the
    # export really reports milliseconds.
    print(f'║ Requests: {n}')
    print(f'║ Request Throughput: {fmt(req_throughput)} req/s')
    print(f'║ Output Token Throughput: {fmt(token_throughput)} tokens/s')
    print('║')
    print('║ Time to First Token (ms):')
    print(stats_row(ttfts))
    print('║')
    print('║ Inter-Token Latency (ms):')
    print(stats_row(itls))
    print('║')
    print('║ Request Latency (ms):')
    print(stats_row(latencies))
    print('║')
    print('║ Time to Second Token (ms):')
    print(stats_row(ttsts))


try:
    main(sys.argv)
except Exception as e:
    # Display is best-effort: report the problem inside the box and keep the
    # full traceback on stderr for debugging.
    print(f'║ ⚠️ Could not parse results: {e}')
    traceback.print_exc(file=sys.stderr)
PY
}

if command -v python3 &>/dev/null; then
  _mlcc_print_jsonl_metrics "${RESULTS_JSONL}"
else
  # Fallback: display raw JSON if python3 is not available
  echo "║ (python3 not available — showing raw results)"
  echo "║"
  if [ -f "${RESULTS_JSONL}" ]; then
    head -3 "${RESULTS_JSONL}"
  elif [ -f "${RESULTS_FILE}" ]; then
    head -50 "${RESULTS_FILE}"
  fi
fi
637
1158
 
# Close the results box and tell the user where the results live, locally
# and in S3 (falling back to the configured output path when no results
# path was resolved).
echo "╚══════════════════════════════════════════════════════════════════╝"
echo ""
echo "📁 Results saved to: benchmarks/${BENCHMARK_JOB_NAME}/"
echo "☁️ S3 results: ${RESULTS_S3_PATH:-${BENCHMARK_S3_OUTPUT_PATH}}"

# Write marker for multi-level parent to find this results dir.
# Best-effort: the write is wrapped in a brace group so that 2>/dev/null is
# already in effect when the marker file is opened; with the redirections on
# the echo itself, a failed open is reported before stderr is silenced.
# NOTE(review): the marker lives at a predictable path under /tmp; the path
# is left unchanged because whatever reads the marker presumably expects it
# there — confirm before moving it somewhere private.
{ echo "${LOCAL_RESULTS_DIR}" > "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}"; } 2>/dev/null || true
642
1166
  else
643
1167
  echo "⚠️ Could not download results from S3"
644
1168
  echo " The benchmark completed but results could not be located."
@@ -661,6 +1185,36 @@ except Exception as e:
661
1185
  --output table 2>/dev/null || echo " (could not list objects)"
662
1186
  fi
663
1187
 
# ── Persist benchmark results to Athena ──────────────────────────────────
# Runs only when all three hold:
#   * CI_BENCHMARK_RESULTS_BUCKET is set (from bootstrap config),
#   * results were downloaded, and
#   * _BENCHMARK_SINGLE_LEVEL is empty, i.e. this is not a child of
#     multi-level mode. In that mode the parent orchestrator handles combined
#     persistence (one row per concurrency level, no duplicates).
# The benchmark writer persists the results as Parquet to S3 for Athena
# querying.
if [[ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" && "${RESULTS_DOWNLOADED}" = true && -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]]; then
  echo ""
  echo "📊 Persisting benchmark results to Athena..."

  # Hand the writer the JSONL export when it exists, otherwise the JSON file.
  if [[ -f "${RESULTS_JSONL}" ]]; then
    _WRITER_INPUT="${RESULTS_JSONL}"
  else
    _WRITER_INPUT="${RESULTS_FILE}"
  fi

  # Best-effort: a writer failure is reported but never fails this script.
  if ! python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
    --results-file "${_WRITER_INPUT}" \
    --config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
    --project-name "${PROJECT_NAME}" \
    --workload "${BENCHMARK_WORKLOAD:-manual}" \
    --concurrency "${BENCHMARK_CONCURRENCY}" \
    --bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
    --region "${AWS_REGION:-${REGION}}"; then
    echo "⚠️ Failed to persist benchmark results to Athena (non-fatal)"
    echo " Results remain available locally in: benchmarks/${BENCHMARK_JOB_NAME}/"
  else
    echo "✅ Benchmark results persisted to S3"
  fi
fi
664
1218
  elif [ "${JOB_STATUS}" = "Failed" ]; then
665
1219
  # Display failure reason
666
1220
  echo "❌ Step 4: Benchmark job failed"