@aws/ml-container-creator 0.9.1 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +2049 -0
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -68
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +138 -138
- package/servers/instance-sizer/lib/instance-ranker.js +76 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/jumpstart-public.json +101 -16
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/catalogs/models.json +182 -26
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +4 -390
- package/src/lib/bootstrap-command-handler.js +710 -1148
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +641 -0
- package/src/lib/bootstrap-provisioners.js +421 -0
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +408 -0
- package/src/lib/config-manager.js +66 -1685
- package/src/lib/config-mcp-client.js +118 -0
- package/src/lib/config-validator.js +634 -0
- package/src/lib/cuda-resolver.js +149 -0
- package/src/lib/e2e-catalog-validator.js +251 -3
- package/src/lib/e2e-ci-recorder.js +103 -0
- package/src/lib/generated/cli-options.js +315 -311
- package/src/lib/generated/parameter-matrix.js +671 -0
- package/src/lib/generated/validation-rules.js +71 -71
- package/src/lib/marketplace-flow.js +276 -0
- package/src/lib/mcp-query-runner.js +768 -0
- package/src/lib/parameter-schema-validator.js +62 -18
- package/src/lib/path-prover-brain.js +607 -0
- package/src/lib/prompt-runner.js +41 -1504
- package/src/lib/prompts/feature-prompts.js +172 -0
- package/src/lib/prompts/index.js +48 -0
- package/src/lib/prompts/infrastructure-prompts.js +690 -0
- package/src/lib/prompts/model-prompts.js +552 -0
- package/src/lib/prompts/project-prompts.js +82 -0
- package/src/lib/prompts.js +2 -1446
- package/src/lib/registry-command-handler.js +135 -3
- package/src/lib/secrets-prompt-runner.js +251 -0
- package/src/lib/template-variable-resolver.js +422 -0
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +149 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/config +108 -5
- package/templates/do/deploy.d/managed-inference.ejs +192 -11
- package/templates/do/optimize +106 -37
- package/templates/do/register +89 -0
- package/templates/do/test +13 -0
- package/templates/do/tune +378 -59
- package/templates/do/validate +44 -4
- package/config/parameter-schema.json +0 -88
package/templates/do/tune
CHANGED
|
@@ -40,6 +40,11 @@ ARG_STATUS=false
|
|
|
40
40
|
ARG_HELP=false
|
|
41
41
|
ARG_DRY_RUN=false
|
|
42
42
|
ARG_LIST_MODELS=false
|
|
43
|
+
ARG_NO_STALE_WARNING=false
|
|
44
|
+
ARG_DISCOVER=false
|
|
45
|
+
ARG_DISCOVER_FILTER=""
|
|
46
|
+
ARG_COLUMN_MAP=""
|
|
47
|
+
ARG_ACCEPT_EULA=false
|
|
43
48
|
|
|
44
49
|
|
|
45
50
|
# ── _parse_args() ─────────────────────────────────────────────────────────────
|
|
@@ -132,11 +137,27 @@ _parse_args() {
|
|
|
132
137
|
fi
|
|
133
138
|
ARG_ROLE="$2"; shift 2 ;;
|
|
134
139
|
--force) ARG_FORCE=true; shift ;;
|
|
140
|
+
--accept-eula) ARG_ACCEPT_EULA=true; shift ;;
|
|
135
141
|
--no-wait) ARG_NO_WAIT=true; shift ;;
|
|
136
142
|
--status) ARG_STATUS=true; shift ;;
|
|
137
143
|
--help|-h) ARG_HELP=true; shift ;;
|
|
138
144
|
--dry-run) ARG_DRY_RUN=true; shift ;;
|
|
139
145
|
--list-models) ARG_LIST_MODELS=true; shift ;;
|
|
146
|
+
--no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
|
|
147
|
+
--column-map)
|
|
148
|
+
if [ -z "${2:-}" ]; then
|
|
149
|
+
echo "❌ --column-map requires a value (e.g., prompt=question,completion=answer)"
|
|
150
|
+
exit 1
|
|
151
|
+
fi
|
|
152
|
+
ARG_COLUMN_MAP="$2"; shift 2 ;;
|
|
153
|
+
--discover)
|
|
154
|
+
ARG_DISCOVER=true
|
|
155
|
+
shift
|
|
156
|
+
if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
|
|
157
|
+
ARG_DISCOVER_FILTER="$1"
|
|
158
|
+
shift
|
|
159
|
+
fi
|
|
160
|
+
;;
|
|
140
161
|
*)
|
|
141
162
|
echo "❌ Unknown option: $1"
|
|
142
163
|
echo " Run ./do/tune --help for usage."
|
|
@@ -150,6 +171,8 @@ _parse_args() {
|
|
|
150
171
|
# ── _show_help() ──────────────────────────────────────────────────────────────
|
|
151
172
|
_show_help() {
|
|
152
173
|
echo "Usage: ./do/tune --technique <technique> --dataset <source> [options]"
|
|
174
|
+
echo " ./do/tune --model <id> --technique <technique> --dataset <source>"
|
|
175
|
+
echo " ./do/tune --discover [filter]"
|
|
153
176
|
echo " ./do/tune --status"
|
|
154
177
|
echo " ./do/tune --list-models"
|
|
155
178
|
echo " ./do/tune --help"
|
|
@@ -157,10 +180,48 @@ _show_help() {
|
|
|
157
180
|
echo "SageMaker AI Managed Model Customization — fine-tune supported foundation"
|
|
158
181
|
echo "models using SFT, DPO, RLAIF, or RLVR without managing infrastructure."
|
|
159
182
|
echo ""
|
|
183
|
+
echo "How it works:"
|
|
184
|
+
echo ""
|
|
185
|
+
echo " ┌─────────────────────────────────────────────────────────────────┐"
|
|
186
|
+
echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
|
|
187
|
+
echo " │ ↓ │"
|
|
188
|
+
echo " │ HuggingFace model (deploy) ←──── do/adapter add │"
|
|
189
|
+
echo " │ ↓ │"
|
|
190
|
+
echo " │ vLLM loads adapter at runtime │"
|
|
191
|
+
echo " └─────────────────────────────────────────────────────────────────┘"
|
|
192
|
+
echo ""
|
|
193
|
+
echo " Managed fine-tuning uses a JumpStart Hub model (identified by TUNE_MODEL_ID)"
|
|
194
|
+
echo " to produce LoRA adapter weights. These adapters are then attached to your"
|
|
195
|
+
echo " HuggingFace BYOC deployment via do/adapter add — no redeployment needed."
|
|
196
|
+
echo ""
|
|
197
|
+
echo "Supported model families:"
|
|
198
|
+
echo " • qwen-2.5 (Alibaba) — Qwen 2.5 7B/14B/32B/72B Instruct"
|
|
199
|
+
echo " • qwen-3 (Alibaba) — Qwen 3 0.6B/1.7B/4B/8B/14B/32B"
|
|
200
|
+
echo " • llama-3 (Meta) — Llama 3.1 8B, 3.2 1B/3B, 3.3 70B Instruct"
|
|
201
|
+
echo " • deepseek-r1 (DeepSeek) — R1 Distill Llama 8B/70B, Qwen 1.5B/7B/14B/32B"
|
|
202
|
+
echo " • gpt-oss (OpenAI) — GPT-OSS 20B/120B"
|
|
203
|
+
echo ""
|
|
204
|
+
echo " Only models registered in the SageMaker JumpStart Hub support managed"
|
|
205
|
+
echo " fine-tuning. Not all HuggingFace models have a JumpStart equivalent."
|
|
206
|
+
echo ""
|
|
207
|
+
echo "Finding your JumpStart model ID:"
|
|
208
|
+
echo ""
|
|
209
|
+
echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
210
|
+
echo " --hub-content-type Model \\"
|
|
211
|
+
echo " --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
|
|
212
|
+
echo ""
|
|
213
|
+
echo " Or use: ./do/tune --discover [filter]"
|
|
214
|
+
echo ""
|
|
160
215
|
echo "Required:"
|
|
161
216
|
echo " --technique <t> Customization technique: sft, dpo, rlaif, rlvr"
|
|
162
217
|
echo " --dataset <source> Dataset: s3://bucket/path.jsonl or hf://org/name[/split]"
|
|
163
218
|
echo ""
|
|
219
|
+
echo "Model selection:"
|
|
220
|
+
echo " --model <id> JumpStart Hub content name to use for fine-tuning."
|
|
221
|
+
echo " Takes precedence over TUNE_MODEL_ID in do/config."
|
|
222
|
+
echo " Accepts the Hub content name as-is (no catalog lookup)."
|
|
223
|
+
echo " Example: --model huggingface-reasoning-qwen3-8b"
|
|
224
|
+
echo ""
|
|
164
225
|
echo "Training type:"
|
|
165
226
|
echo " --training-type <t> lora (default) or full-rank"
|
|
166
227
|
echo ""
|
|
@@ -177,26 +238,55 @@ _show_help() {
|
|
|
177
238
|
echo " --reward-prompt <uri> S3 URI for reward prompt file"
|
|
178
239
|
echo ""
|
|
179
240
|
echo "Overrides:"
|
|
180
|
-
echo " --model <id> Override model (defaults to MODEL_ID from do/config)"
|
|
181
241
|
echo " --output-bucket <b> Override output bucket (defaults to TUNE_S3_BUCKET)"
|
|
182
242
|
echo " --role <arn> Override execution role (defaults to ROLE_ARN)"
|
|
183
243
|
echo ""
|
|
184
244
|
echo "Job control:"
|
|
185
245
|
echo " --force Force new job even if one exists for this technique"
|
|
246
|
+
echo " --accept-eula Accept model EULA (required for gated models like Llama)"
|
|
186
247
|
echo " --no-wait Submit and exit without polling for completion"
|
|
187
248
|
echo " --status Show status of all tracked tune jobs"
|
|
188
249
|
echo ""
|
|
250
|
+
echo "Discovery and diagnostics:"
|
|
251
|
+
echo " --discover [filter] Query JumpStart Hub for tune-eligible models."
|
|
252
|
+
echo " Without a filter, shows models for the current family."
|
|
253
|
+
echo " With a filter, narrows results by keyword."
|
|
254
|
+
echo " --no-stale-warning Suppress catalog staleness warnings (useful for CI)."
|
|
255
|
+
echo " Also suppressed by MCC_NO_STALE_WARNING=true env var."
|
|
256
|
+
echo ""
|
|
189
257
|
echo "Informational:"
|
|
190
258
|
echo " --help, -h Show this help message"
|
|
191
259
|
echo " --dry-run Validate inputs and show what would be submitted"
|
|
192
260
|
echo " --list-models Print supported models, techniques, and training types"
|
|
193
261
|
echo ""
|
|
194
262
|
echo "Examples:"
|
|
263
|
+
echo " # Fine-tune with pre-configured TUNE_MODEL_ID from do/config:"
|
|
195
264
|
echo " ./do/tune --technique sft --dataset s3://my-bucket/train.jsonl"
|
|
265
|
+
echo ""
|
|
266
|
+
echo " # Override model ID directly:"
|
|
267
|
+
echo " ./do/tune --model huggingface-reasoning-qwen3-8b --technique sft --dataset s3://bucket/data.jsonl"
|
|
268
|
+
echo ""
|
|
269
|
+
echo " # Use a HuggingFace dataset:"
|
|
196
270
|
echo " ./do/tune --technique dpo --dataset hf://my-org/pref-data --learning-rate 1e-5"
|
|
271
|
+
echo ""
|
|
272
|
+
echo " # Fine-tune a gated model (Meta Llama) — requires EULA acceptance:"
|
|
273
|
+
echo " ./do/tune --technique dpo --dataset hf://argilla/ultrafeedback-binarized-preferences-cleaned --accept-eula"
|
|
274
|
+
echo ""
|
|
275
|
+
echo " # Discover available models:"
|
|
276
|
+
echo " ./do/tune --discover # Models for current family"
|
|
277
|
+
echo " ./do/tune --discover qwen # Filter by keyword"
|
|
278
|
+
echo ""
|
|
279
|
+
echo " # Other:"
|
|
197
280
|
echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --training-type full-rank"
|
|
198
|
-
echo " ./do/tune --status"
|
|
199
281
|
echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --dry-run"
|
|
282
|
+
echo " ./do/tune --status"
|
|
283
|
+
echo ""
|
|
284
|
+
echo "Configuration:"
|
|
285
|
+
echo " TUNE_MODEL_ID is set in do/config at generation time when a matching"
|
|
286
|
+
echo " JumpStart model is found for your HuggingFace model. If not set, use"
|
|
287
|
+
echo " --model <id> or run --discover to find the correct Hub content name."
|
|
288
|
+
echo ""
|
|
289
|
+
echo " For custom training without JumpStart, see: ./do/train --help"
|
|
200
290
|
exit 0
|
|
201
291
|
}
|
|
202
292
|
|
|
@@ -213,7 +303,7 @@ _show_status() {
|
|
|
213
303
|
|
|
214
304
|
if [ -n "${job_name}" ]; then
|
|
215
305
|
found_any=true
|
|
216
|
-
echo " ${technique
|
|
306
|
+
echo " $(echo "${technique}" | tr "[:lower:]" "[:upper:]"):"
|
|
217
307
|
echo " Job: ${job_name}"
|
|
218
308
|
|
|
219
309
|
# Query status via Python helper
|
|
@@ -318,66 +408,203 @@ _update_config_var() {
|
|
|
318
408
|
fi
|
|
319
409
|
}
|
|
320
410
|
|
|
321
|
-
# ──
|
|
322
|
-
#
|
|
411
|
+
# ── _check_catalog_staleness() ─────────────────────────────────────────────────
|
|
412
|
+
# Warn if the tune catalog's lastSynced timestamp is older than the threshold.
|
|
413
|
+
# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
|
|
414
|
+
# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
|
|
415
|
+
_check_catalog_staleness() {
|
|
416
|
+
if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
|
|
417
|
+
return 0
|
|
418
|
+
fi
|
|
419
|
+
local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
|
|
420
|
+
local last_synced
|
|
421
|
+
last_synced=$(python3 -c "
|
|
422
|
+
import json, sys
|
|
423
|
+
from datetime import datetime, timezone
|
|
424
|
+
try:
|
|
425
|
+
with open('${CATALOG_FILE}') as f:
|
|
426
|
+
catalog = json.load(f)
|
|
427
|
+
ls = catalog.get('lastSynced', '')
|
|
428
|
+
if not ls:
|
|
429
|
+
sys.exit(0)
|
|
430
|
+
synced = datetime.fromisoformat(ls.replace('Z', '+00:00'))
|
|
431
|
+
days = (datetime.now(timezone.utc) - synced).days
|
|
432
|
+
if days > int('${threshold}'):
|
|
433
|
+
print(days)
|
|
434
|
+
except:
|
|
435
|
+
pass
|
|
436
|
+
" 2>/dev/null)
|
|
437
|
+
if [ -n "${last_synced}" ]; then
|
|
438
|
+
echo "⚠️ Tune catalog is ${last_synced} days old. Run 'ml-container-creator bootstrap sync-model-families' to update."
|
|
439
|
+
fi
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
# ── _resolve_tune_model() ─────────────────────────────────────────────────────
|
|
443
|
+
# Resolve the JumpStart Hub content name for managed fine-tuning.
|
|
444
|
+
# Priority: --model flag > TUNE_MODEL_ID config > discovery
|
|
323
445
|
# Sets RESOLVED_MODEL_ID on success.
|
|
324
|
-
|
|
325
|
-
#
|
|
446
|
+
_resolve_tune_model() {
|
|
447
|
+
# Priority 1: --model flag (format-check only, no catalog validation)
|
|
326
448
|
if [ -n "${ARG_MODEL}" ]; then
|
|
449
|
+
if ! echo "${ARG_MODEL}" | grep -qE '^[a-zA-Z0-9](-*[a-zA-Z0-9])*$'; then
|
|
450
|
+
echo "❌ Invalid model ID format: ${ARG_MODEL}"
|
|
451
|
+
echo " Hub content names must match: [a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}"
|
|
452
|
+
exit 1
|
|
453
|
+
fi
|
|
327
454
|
RESOLVED_MODEL_ID="${ARG_MODEL}"
|
|
328
|
-
|
|
329
|
-
RESOLVED_MODEL_ID="${MODEL_ID}"
|
|
330
|
-
elif [ -n "${MODEL_NAME:-}" ]; then
|
|
331
|
-
RESOLVED_MODEL_ID="${MODEL_NAME}"
|
|
332
|
-
else
|
|
333
|
-
echo "❌ No model configured"
|
|
334
|
-
echo " Set MODEL_ID in do/config or use --model <id>"
|
|
335
|
-
exit 1
|
|
455
|
+
return 0
|
|
336
456
|
fi
|
|
337
457
|
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
458
|
+
# Priority 2: TUNE_MODEL_ID from do/config
|
|
459
|
+
if [ -n "${TUNE_MODEL_ID:-}" ]; then
|
|
460
|
+
RESOLVED_MODEL_ID="${TUNE_MODEL_ID}"
|
|
461
|
+
return 0
|
|
342
462
|
fi
|
|
343
463
|
|
|
344
|
-
#
|
|
345
|
-
|
|
346
|
-
|
|
464
|
+
# Priority 3: Neither set — attempt runtime discovery, then show guidance
|
|
465
|
+
_discover_and_guide
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
# ── _discover_and_guide() ─────────────────────────────────────────────────────
|
|
469
|
+
# Display guidance when no model ID is configured and attempt runtime discovery.
|
|
470
|
+
# Attempts Hub discovery via helper script, falls back to static guidance on failure.
|
|
471
|
+
_discover_and_guide() {
|
|
472
|
+
echo ""
|
|
473
|
+
echo "🔧 SageMaker AI Managed Model Customization"
|
|
474
|
+
echo ""
|
|
475
|
+
echo " This feature uses SageMaker Serverless Fine-Tuning, which requires"
|
|
476
|
+
echo " the model to be registered in the SageMaker JumpStart Hub."
|
|
477
|
+
echo ""
|
|
478
|
+
echo " Your deployed model: ${MODEL_NAME:-unknown} (HuggingFace BYOC)"
|
|
479
|
+
echo " JumpStart model ID: (not configured)"
|
|
480
|
+
echo ""
|
|
481
|
+
|
|
482
|
+
# Derive model family from the catalog based on MODEL_NAME (HuggingFace ID)
|
|
483
|
+
local model_family=""
|
|
484
|
+
if [ -f "${CATALOG_FILE}" ] && [ -n "${MODEL_NAME:-}" ]; then
|
|
485
|
+
model_family=$(python3 -c "
|
|
347
486
|
import json, sys
|
|
487
|
+
try:
|
|
488
|
+
with open('${CATALOG_FILE}') as f:
|
|
489
|
+
catalog = json.load(f)
|
|
490
|
+
model_name = '${MODEL_NAME}'
|
|
491
|
+
for entry in catalog.get('models', {}).values():
|
|
492
|
+
if entry.get('huggingFaceId', '') == model_name:
|
|
493
|
+
print(entry.get('family', ''))
|
|
494
|
+
sys.exit(0)
|
|
495
|
+
except:
|
|
496
|
+
pass
|
|
497
|
+
" 2>/dev/null) || model_family=""
|
|
498
|
+
fi
|
|
348
499
|
|
|
349
|
-
|
|
350
|
-
|
|
500
|
+
# Attempt runtime discovery via helper script
|
|
501
|
+
local discover_result=""
|
|
502
|
+
if [ -f "${HELPER_SCRIPT}" ]; then
|
|
503
|
+
discover_result=$(python3 "${HELPER_SCRIPT}" discover \
|
|
504
|
+
--family "${model_family}" \
|
|
505
|
+
--region "${AWS_REGION}" 2>/dev/null) || discover_result=""
|
|
506
|
+
fi
|
|
351
507
|
|
|
352
|
-
|
|
353
|
-
|
|
508
|
+
if [ -n "${discover_result}" ] && echo "${discover_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d.get('models') else 1)" 2>/dev/null; then
|
|
509
|
+
echo " 📋 Suggested models for your family:"
|
|
510
|
+
echo "${discover_result}" | python3 -c "
|
|
511
|
+
import sys, json
|
|
512
|
+
d = json.load(sys.stdin)
|
|
513
|
+
for m in d.get('models', [])[:5]:
|
|
514
|
+
print(f' • {m}')
|
|
515
|
+
" 2>/dev/null
|
|
516
|
+
echo ""
|
|
517
|
+
fi
|
|
354
518
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
"
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
519
|
+
echo " To find your model's JumpStart ID:"
|
|
520
|
+
echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
521
|
+
echo " --hub-content-type Model --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
|
|
522
|
+
echo ""
|
|
523
|
+
echo " Then run:"
|
|
524
|
+
echo " ./do/tune --model <jumpstart-id> --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET}"
|
|
525
|
+
echo ""
|
|
526
|
+
echo " Or set it permanently in do/config:"
|
|
527
|
+
echo " export TUNE_MODEL_ID=\"<jumpstart-id>\""
|
|
528
|
+
echo ""
|
|
529
|
+
echo " ┌─────────────────────────────────────────────────────────────┐"
|
|
530
|
+
echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
|
|
531
|
+
echo " │ ↓ │"
|
|
532
|
+
echo " │ HuggingFace model (deploy) ←── do/adapter add │"
|
|
533
|
+
echo " │ ↓ │"
|
|
534
|
+
echo " │ vLLM loads adapter at runtime │"
|
|
535
|
+
echo " └─────────────────────────────────────────────────────────────┘"
|
|
536
|
+
echo ""
|
|
537
|
+
echo " For custom training without JumpStart, see: ./do/train --help"
|
|
538
|
+
exit 3
|
|
539
|
+
}
|
|
366
540
|
|
|
367
|
-
|
|
368
|
-
|
|
541
|
+
# ── _run_discover() ───────────────────────────────────────────────────────────
|
|
542
|
+
# Explicit --discover mode: query the JumpStart Hub and display tune-eligible models.
|
|
543
|
+
# Accepts an optional filter keyword to narrow results.
|
|
544
|
+
_run_discover() {
|
|
545
|
+
local filter="${1:-}"
|
|
546
|
+
|
|
547
|
+
echo ""
|
|
548
|
+
echo "🔍 Discovering tune-eligible models in SageMaker JumpStart Hub"
|
|
549
|
+
echo " Region: ${AWS_REGION}"
|
|
550
|
+
if [ -n "${filter}" ]; then
|
|
551
|
+
echo " Filter: ${filter}"
|
|
552
|
+
elif [ -n "${MODEL_FAMILY:-}" ]; then
|
|
553
|
+
echo " Family: ${MODEL_FAMILY}"
|
|
554
|
+
fi
|
|
555
|
+
echo ""
|
|
556
|
+
|
|
557
|
+
# Build discover arguments
|
|
558
|
+
local discover_args=(
|
|
559
|
+
--region "${AWS_REGION}"
|
|
560
|
+
)
|
|
561
|
+
if [ -n "${filter}" ]; then
|
|
562
|
+
discover_args+=(--filter "${filter}")
|
|
563
|
+
elif [ -n "${MODEL_FAMILY:-}" ]; then
|
|
564
|
+
discover_args+=(--family "${MODEL_FAMILY}")
|
|
369
565
|
fi
|
|
370
566
|
|
|
371
|
-
#
|
|
372
|
-
local
|
|
373
|
-
|
|
567
|
+
# Call helper script discover subcommand
|
|
568
|
+
local discover_result
|
|
569
|
+
discover_result=$(python3 "${HELPER_SCRIPT}" discover "${discover_args[@]}" 2>&1) || {
|
|
570
|
+
echo "❌ Discovery failed"
|
|
571
|
+
echo " ${discover_result}"
|
|
572
|
+
echo ""
|
|
573
|
+
echo " Ensure AWS credentials are configured and you have sagemaker:ListHubContents permission."
|
|
574
|
+
echo ""
|
|
575
|
+
echo " Manual alternative:"
|
|
576
|
+
echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
577
|
+
echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
|
|
578
|
+
exit 1
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
# Parse and display results
|
|
582
|
+
local count
|
|
583
|
+
count=$(echo "${discover_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count', 0))" 2>/dev/null) || count="0"
|
|
374
584
|
|
|
375
|
-
|
|
376
|
-
|
|
585
|
+
if [ "${count}" = "0" ]; then
|
|
586
|
+
echo " No tune-eligible models found."
|
|
587
|
+
echo ""
|
|
588
|
+
echo " Try a different filter or check available models manually:"
|
|
589
|
+
echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
590
|
+
echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
|
|
591
|
+
else
|
|
592
|
+
echo " 📋 Tune-eligible models (${count} found):"
|
|
593
|
+
echo ""
|
|
594
|
+
echo "${discover_result}" | python3 -c "
|
|
595
|
+
import sys, json
|
|
596
|
+
d = json.load(sys.stdin)
|
|
597
|
+
for m in d.get('models', []):
|
|
598
|
+
print(f' • {m}')
|
|
599
|
+
" 2>/dev/null
|
|
600
|
+
echo ""
|
|
601
|
+
echo " Use with:"
|
|
602
|
+
echo " ./do/tune --model <id> --technique <sft|dpo|rlaif|rlvr> --dataset <source>"
|
|
603
|
+
echo ""
|
|
604
|
+
echo " Or set permanently in do/config:"
|
|
605
|
+
echo " export TUNE_MODEL_ID=\"<id>\""
|
|
606
|
+
fi
|
|
377
607
|
echo ""
|
|
378
|
-
echo " Additional model support and custom training workflows are expected in future releases."
|
|
379
|
-
echo " For custom training workflows, see \`do/train\`."
|
|
380
|
-
exit 1
|
|
381
608
|
}
|
|
382
609
|
|
|
383
610
|
# ── _validate_technique() ─────────────────────────────────────────────────────
|
|
@@ -546,9 +773,17 @@ _validate_dataset() {
|
|
|
546
773
|
elif [[ "${dataset}" == hf://* ]]; then
|
|
547
774
|
# Hugging Face dataset — parse reference and stage to S3
|
|
548
775
|
local hf_path="${dataset#hf://}"
|
|
776
|
+
local hf_file=""
|
|
777
|
+
|
|
778
|
+
# Extract ?file= parameter before parsing path components
|
|
779
|
+
if [[ "${hf_path}" == *"?file="* ]]; then
|
|
780
|
+
hf_file="${hf_path#*?file=}"
|
|
781
|
+
hf_path="${hf_path%%\?file=*}"
|
|
782
|
+
fi
|
|
783
|
+
|
|
549
784
|
local hf_org hf_name hf_split
|
|
550
785
|
|
|
551
|
-
# Parse org/name/split
|
|
786
|
+
# Parse org/name/split from the cleaned path
|
|
552
787
|
hf_org=$(echo "${hf_path}" | cut -d'/' -f1)
|
|
553
788
|
hf_name=$(echo "${hf_path}" | cut -d'/' -f2)
|
|
554
789
|
hf_split=$(echo "${hf_path}" | cut -d'/' -f3-)
|
|
@@ -583,9 +818,16 @@ _validate_dataset() {
|
|
|
583
818
|
if [ -n "${HF_TOKEN_ARN:-}" ]; then
|
|
584
819
|
stage_args+=(--hf-secret-name "${HF_TOKEN_ARN}")
|
|
585
820
|
fi
|
|
821
|
+
if [ -n "${ARG_COLUMN_MAP}" ]; then
|
|
822
|
+
stage_args+=(--column-map "${ARG_COLUMN_MAP}")
|
|
823
|
+
fi
|
|
824
|
+
stage_args+=(--technique "${ARG_TECHNIQUE}")
|
|
825
|
+
if [ -n "${hf_file}" ]; then
|
|
826
|
+
stage_args+=(--hf-file "${hf_file}")
|
|
827
|
+
fi
|
|
586
828
|
|
|
587
829
|
local stage_result
|
|
588
|
-
stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}"
|
|
830
|
+
stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}") || {
|
|
589
831
|
local error_msg
|
|
590
832
|
error_msg=$(echo "${stage_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Failed to stage dataset'))" 2>/dev/null) || error_msg="Failed to stage HF dataset"
|
|
591
833
|
echo "❌ ${error_msg}"
|
|
@@ -663,7 +905,7 @@ _check_idempotency() {
|
|
|
663
905
|
return 0 # No existing job or --force: proceed with new job
|
|
664
906
|
fi
|
|
665
907
|
|
|
666
|
-
echo "🔍 Found existing ${ARG_TECHNIQUE
|
|
908
|
+
echo "🔍 Found existing $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") job: ${existing_job}"
|
|
667
909
|
|
|
668
910
|
# Query status via Python helper
|
|
669
911
|
local status_json
|
|
@@ -752,7 +994,26 @@ _submit_job() {
|
|
|
752
994
|
timestamp=$(date +%Y%m%d-%H%M%S)
|
|
753
995
|
JOB_NAME="${PROJECT_NAME}-tune-${ARG_TECHNIQUE}-${timestamp}"
|
|
754
996
|
|
|
755
|
-
|
|
997
|
+
# Check if model requires EULA acceptance (gated models from Meta, etc.)
|
|
998
|
+
if [ "${ARG_ACCEPT_EULA}" != true ]; then
|
|
999
|
+
local model_provider
|
|
1000
|
+
model_provider=$(python3 -c "
|
|
1001
|
+
import json
|
|
1002
|
+
with open('${CATALOG_FILE}') as f:
|
|
1003
|
+
catalog = json.load(f)
|
|
1004
|
+
entry = catalog.get('models', {}).get('${RESOLVED_MODEL_ID}', {})
|
|
1005
|
+
print(entry.get('provider', ''))
|
|
1006
|
+
" 2>/dev/null) || model_provider=""
|
|
1007
|
+
if [ "${model_provider}" = "meta" ]; then
|
|
1008
|
+
echo "⚠️ ${RESOLVED_MODEL_ID} is a gated model that requires EULA acceptance."
|
|
1009
|
+
echo " Add --accept-eula to proceed:"
|
|
1010
|
+
echo " ./do/tune --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET} --accept-eula"
|
|
1011
|
+
echo ""
|
|
1012
|
+
exit 1
|
|
1013
|
+
fi
|
|
1014
|
+
fi
|
|
1015
|
+
|
|
1016
|
+
echo "🚀 Submitting $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") customization job"
|
|
756
1017
|
echo " Job name: ${JOB_NAME}"
|
|
757
1018
|
echo " Model: ${RESOLVED_MODEL_ID}"
|
|
758
1019
|
echo " Technique: ${ARG_TECHNIQUE}"
|
|
@@ -764,6 +1025,7 @@ _submit_job() {
|
|
|
764
1025
|
# Build submit arguments
|
|
765
1026
|
local submit_args=(
|
|
766
1027
|
--model-id "${RESOLVED_MODEL_ID}"
|
|
1028
|
+
--region "${AWS_REGION}"
|
|
767
1029
|
--technique "${ARG_TECHNIQUE}"
|
|
768
1030
|
--training-type "${ARG_TRAINING_TYPE}"
|
|
769
1031
|
--dataset-s3-uri "${RESOLVED_DATASET_S3_URI}"
|
|
@@ -801,15 +1063,54 @@ _submit_job() {
|
|
|
801
1063
|
if [ -n "${ARG_REWARD_PROMPT}" ]; then
|
|
802
1064
|
submit_args+=(--reward-prompt "${ARG_REWARD_PROMPT}")
|
|
803
1065
|
fi
|
|
1066
|
+
if [ "${ARG_ACCEPT_EULA}" = true ]; then
|
|
1067
|
+
submit_args+=(--accept-eula)
|
|
1068
|
+
fi
|
|
804
1069
|
|
|
805
|
-
# Invoke Python helper
|
|
1070
|
+
# Invoke Python helper (stderr visible to user for diagnostics)
|
|
806
1071
|
local submit_result
|
|
807
|
-
|
|
1072
|
+
local submit_stderr
|
|
1073
|
+
submit_stderr=$(mktemp)
|
|
1074
|
+
submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>"${submit_stderr}") || {
|
|
808
1075
|
echo "❌ Failed to submit customization job"
|
|
809
|
-
echo "
|
|
1076
|
+
echo " Model ID used: ${RESOLVED_MODEL_ID}"
|
|
1077
|
+
echo ""
|
|
1078
|
+
# Show stderr from helper script
|
|
1079
|
+
if [ -s "${submit_stderr}" ]; then
|
|
1080
|
+
echo " Error output:"
|
|
1081
|
+
sed 's/^/ /' "${submit_stderr}"
|
|
1082
|
+
echo ""
|
|
1083
|
+
# Check for ResourceNotFound and suggest verification
|
|
1084
|
+
if grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found" "${submit_stderr}"; then
|
|
1085
|
+
echo " 💡 The model ID may not exist in the JumpStart Hub."
|
|
1086
|
+
echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
1087
|
+
echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
|
|
1088
|
+
echo ""
|
|
1089
|
+
fi
|
|
1090
|
+
fi
|
|
1091
|
+
# Show stdout error JSON if available
|
|
1092
|
+
if [ -n "${submit_result:-}" ]; then
|
|
1093
|
+
local err_msg
|
|
1094
|
+
err_msg=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',''))" 2>/dev/null) || err_msg=""
|
|
1095
|
+
if [ -n "${err_msg}" ]; then
|
|
1096
|
+
echo " SDK error: ${err_msg}"
|
|
1097
|
+
echo ""
|
|
1098
|
+
fi
|
|
1099
|
+
fi
|
|
1100
|
+
rm -f "${submit_stderr}"
|
|
810
1101
|
exit 1
|
|
811
1102
|
}
|
|
812
1103
|
|
|
1104
|
+
# Show any stderr warnings from helper script (non-fatal)
|
|
1105
|
+
if [ -s "${submit_stderr}" ]; then
|
|
1106
|
+
sed 's/^/ ⚠️ /' "${submit_stderr}"
|
|
1107
|
+
fi
|
|
1108
|
+
rm -f "${submit_stderr}"
|
|
1109
|
+
|
|
1110
|
+
# SDK may print status lines to stdout before our JSON (e.g., "Training Job Name: ...")
|
|
1111
|
+
# Extract only the JSON line (last line starting with '{')
|
|
1112
|
+
submit_result=$(echo "${submit_result}" | grep '^{' | tail -1)
|
|
1113
|
+
|
|
813
1114
|
# Check for error in response
|
|
814
1115
|
local has_error
|
|
815
1116
|
has_error=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print('yes' if 'error' in d else 'no')" 2>/dev/null) || has_error="yes"
|
|
@@ -818,6 +1119,14 @@ _submit_job() {
|
|
|
818
1119
|
local error_msg
|
|
819
1120
|
error_msg=$(echo "${submit_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Unknown error'))" 2>/dev/null) || error_msg="Unknown error"
|
|
820
1121
|
echo "❌ ${error_msg}"
|
|
1122
|
+
echo " Model ID used: ${RESOLVED_MODEL_ID}"
|
|
1123
|
+
# Check for ResourceNotFound in the error message
|
|
1124
|
+
if echo "${error_msg}" | grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found"; then
|
|
1125
|
+
echo ""
|
|
1126
|
+
echo " 💡 The model ID may not exist in the JumpStart Hub."
|
|
1127
|
+
echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
1128
|
+
echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
|
|
1129
|
+
fi
|
|
821
1130
|
exit 1
|
|
822
1131
|
fi
|
|
823
1132
|
|
|
@@ -1084,6 +1393,12 @@ if [ "${ARG_STATUS}" = true ]; then
|
|
|
1084
1393
|
_show_status
|
|
1085
1394
|
fi
|
|
1086
1395
|
|
|
1396
|
+
# Handle --discover flag (before requiring --technique and --dataset)
|
|
1397
|
+
if [ "${ARG_DISCOVER}" = true ]; then
|
|
1398
|
+
_run_discover "${ARG_DISCOVER_FILTER}"
|
|
1399
|
+
exit 0
|
|
1400
|
+
fi
|
|
1401
|
+
|
|
1087
1402
|
# Validate required arguments for job submission
|
|
1088
1403
|
if [ -z "${ARG_TECHNIQUE}" ]; then
|
|
1089
1404
|
echo "❌ --technique is required"
|
|
@@ -1099,11 +1414,14 @@ if [ -z "${ARG_DATASET}" ]; then
|
|
|
1099
1414
|
exit 1
|
|
1100
1415
|
fi
|
|
1101
1416
|
|
|
1102
|
-
#
|
|
1103
|
-
if [ "${TUNE_SUPPORTED:-}"
|
|
1104
|
-
echo "⚠️ Managed customization is not supported for the configured model."
|
|
1105
|
-
echo " Checking catalog for current support..."
|
|
1417
|
+
# Golden-path gating — check TUNE_SUPPORTED before any model resolution
|
|
1418
|
+
if [ "${TUNE_SUPPORTED:-}" != "true" ]; then
|
|
1106
1419
|
echo ""
|
|
1420
|
+
echo "❌ Managed fine-tuning is not available for this model family."
|
|
1421
|
+
echo ""
|
|
1422
|
+
echo " Use ./do/train for custom fine-tuning."
|
|
1423
|
+
echo ""
|
|
1424
|
+
exit 1
|
|
1107
1425
|
fi
|
|
1108
1426
|
|
|
1109
1427
|
# Validate Python availability
|
|
@@ -1117,7 +1435,8 @@ fi
|
|
|
1117
1435
|
echo "🔧 SageMaker AI Managed Model Customization"
|
|
1118
1436
|
echo ""
|
|
1119
1437
|
|
|
1120
|
-
|
|
1438
|
+
_check_catalog_staleness
|
|
1439
|
+
_resolve_tune_model
|
|
1121
1440
|
_validate_technique
|
|
1122
1441
|
_validate_training_type
|
|
1123
1442
|
_validate_dataset
|