@aws/ml-container-creator 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/README.md +10 -1
  2. package/bin/cli.js +57 -0
  3. package/config/agent.json +16 -0
  4. package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
  5. package/package.json +5 -2
  6. package/pyproject.toml +3 -0
  7. package/servers/agent-knowledge/index.js +592 -0
  8. package/servers/agent-knowledge/package.json +15 -0
  9. package/servers/base-image-picker/index.js +65 -18
  10. package/servers/instance-sizer/index.js +32 -0
  11. package/servers/lib/catalogs/fleet-drivers.json +38 -0
  12. package/servers/lib/catalogs/model-arch-support.json +51 -0
  13. package/servers/lib/catalogs/model-servers.json +2842 -1730
  14. package/servers/lib/schemas/image-catalog.schema.json +12 -0
  15. package/src/agent/__init__.py +2 -0
  16. package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
  18. package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
  19. package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
  20. package/src/agent/agent.py +513 -0
  21. package/src/agent/config_loader.py +215 -0
  22. package/src/agent/context.py +380 -0
  23. package/src/agent/data/capability-matrix.json +106 -0
  24. package/src/agent/health_check.py +341 -0
  25. package/src/agent/prompts/system.md +173 -0
  26. package/src/agent/requirements-agent.txt +3 -0
  27. package/src/app.js +6 -4
  28. package/src/lib/generated/cli-options.js +1 -1
  29. package/src/lib/generated/parameter-matrix.js +1 -1
  30. package/src/lib/generated/validation-rules.js +1 -1
  31. package/src/lib/mcp-query-runner.js +110 -3
  32. package/src/lib/prompt-runner.js +66 -22
  33. package/src/lib/template-variable-resolver.js +8 -0
  34. package/src/lib/train-config-builder.js +339 -0
  35. package/src/lib/tune-config-state.js +89 -68
  36. package/templates/do/.benchmark_writer.py +3 -0
  37. package/templates/do/.eval_helper.py +409 -0
  38. package/templates/do/.register_helper.py +185 -11
  39. package/templates/do/.train_build_request.py +102 -113
  40. package/templates/do/.train_helper.py +433 -0
  41. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  42. package/templates/do/adapter +157 -0
  43. package/templates/do/benchmark +60 -3
  44. package/templates/do/config +6 -1
  45. package/templates/do/deploy.d/managed-inference.ejs +83 -0
  46. package/templates/do/evaluate +272 -0
  47. package/templates/do/lib/resolve-instance.sh +155 -0
  48. package/templates/do/register +5 -0
  49. package/templates/do/test +1 -0
  50. package/templates/do/train +879 -126
  51. package/templates/do/training/config.yaml +83 -11
  52. package/templates/do/training/dpo/accelerate_config.yaml +24 -0
  53. package/templates/do/training/dpo/defaults.yaml +26 -0
  54. package/templates/do/training/dpo/prompts.json +8 -0
  55. package/templates/do/training/dpo/train.py +363 -0
  56. package/templates/do/training/sft/accelerate_config.yaml +22 -0
  57. package/templates/do/training/sft/defaults.yaml +18 -0
  58. package/templates/do/training/sft/prompts.json +7 -0
  59. package/templates/do/training/sft/train.py +310 -0
  60. package/templates/do/tune +11 -2
  61. package/src/lib/auto-prompt-builder.js +0 -172
  62. package/src/lib/cli-handler.js +0 -529
  63. package/src/lib/community-reports-validator.js +0 -91
  64. package/src/lib/configuration-exporter.js +0 -204
  65. package/src/lib/dataset-slug.js +0 -152
  66. package/src/lib/docker-introspection-validator.js +0 -51
  67. package/src/lib/known-flags-validator.js +0 -200
  68. package/src/lib/schema-validator.js +0 -157
  69. package/src/lib/train-config-parser.js +0 -136
  70. package/src/lib/train-config-persistence.js +0 -143
  71. package/src/lib/train-config-validator.js +0 -112
  72. package/src/lib/train-feedback.js +0 -46
  73. package/src/lib/train-idempotency.js +0 -97
  74. package/src/lib/train-request-builder.js +0 -120
  75. package/src/lib/tune-dataset-validator.js +0 -279
  76. package/src/lib/tune-output-resolver.js +0 -66
  77. package/templates/do/.train_poll_parser.py +0 -135
  78. package/templates/do/.train_status_parser.py +0 -187
  79. /package/templates/do/training/{train.py → custom/train.py} +0 -0
@@ -0,0 +1,106 @@
1
+ {
2
+ "version": "1.0",
3
+ "capabilities": {
4
+ "vllm.realtime-inference.deploy": {
5
+ "status": "green",
6
+ "message": "Fully validated. 11 models (0.6B-8B) through happy path on g5.xlarge. 3× 14B models validated on g5.24xlarge (TP=4) with FP8 quantization: Qwen2.5-14B-Instruct, DeepSeek-R1-Distill-Qwen-14B, Qwen3-14B."
7
+ },
8
+ "vllm.realtime-inference.lora": {
9
+ "status": "green",
10
+ "message": "LoRA adapters via SageMaker Adapter ICs. Hot-swap validated on models up to 8B. 14B+ with LoRA requires FP8 quantization on A10G instances due to CUDA graph memory overhead."
11
+ },
12
+ "vllm.realtime-inference.multi-gpu": {
13
+ "status": "green",
14
+ "message": "Tensor parallelism validated on g5.24xlarge (4× A10G) and g5.12xlarge (4× A10G). TP degree auto-detected. Note: vLLM v0.20.2+ CUDA graph profiler reserves ~5.7 GiB per GPU — factor this into VRAM calculations."
15
+ },
16
+ "vllm.realtime-inference.quantization-fp8": {
17
+ "status": "green",
18
+ "message": "FP8 quantization validated. ~2x throughput improvement with minimal quality loss on supported models."
19
+ },
20
+ "training.custom.sft": {
21
+ "status": "green",
22
+ "message": "Supervised fine-tuning via SageMaker Training Jobs. Single-node multi-GPU validated."
23
+ },
24
+ "training.custom.dpo": {
25
+ "status": "green",
26
+ "message": "DPO/RLHF training pipeline validated. Preference dataset format documented."
27
+ },
28
+ "benchmarking.do-benchmark": {
29
+ "status": "green",
30
+ "message": "End-to-end benchmark pipeline: deploy, load-test, collect metrics, upload to S3/Glue. Athena-queryable results."
31
+ },
32
+ "registry.model-registration": {
33
+ "status": "green",
34
+ "message": "Model registration to SageMaker Model Registry with metadata, lineage, and approval workflows."
35
+ },
36
+ "registry.dataset-registration": {
37
+ "status": "green",
38
+ "message": "Dataset registration with versioning. Supports HuggingFace Hub and S3 sources."
39
+ },
40
+ "sglang.realtime-inference.deploy": {
41
+ "status": "yellow",
42
+ "message": "Base inference functional with RadixAttention. Lightly validated — 3 models tested. No LoRA support.",
43
+ "alternatives": ["Use vllm engine for production workloads with full validation coverage"]
44
+ },
45
+ "hyperpod.deploy": {
46
+ "status": "yellow",
47
+ "message": "HyperPod deployment functional but limited to single-node configurations. Multi-node orchestration not implemented.",
48
+ "alternatives": ["Use SageMaker realtime endpoints via vllm engine for validated multi-GPU inference"]
49
+ },
50
+ "optimization.do-optimize": {
51
+ "status": "yellow",
52
+ "message": "do/optimize is functional but lightly validated. Recommendations may be overly conservative or miss edge cases.",
53
+ "alternatives": ["Run do/benchmark manually and compare results across configurations"]
54
+ },
55
+ "vllm.realtime-inference.large-model-single-gpu": {
56
+ "status": "yellow",
57
+ "message": "14B+ parameter models on A10G GPUs (24GB) are memory-constrained even with TP=4. vLLM v0.20.2+ CUDA graph profiler reserves ~5.7 GiB/GPU, and LoRA pre-allocation (max_loras=30) consumes additional headroom. At FP16 with LoRA enabled, 14B OOMs on g5 instances. FP8 quantization is required.",
58
+ "alternatives": ["Enable FP8 quantization (IC_ENV_VLLM_QUANTIZATION=fp8) — halves model memory", "Reduce max_loras from 30 to 4 and max_lora_rank from 64 to 16", "Disable LoRA entirely for inference-only workloads", "Use L40S (48GB) or A100 (40/80GB) instances instead"]
59
+ },
60
+ "training.spot": {
61
+ "status": "yellow",
62
+ "message": "Spot training supported but checkpoint resume is inconsistent across interruptions. Manual monitoring recommended.",
63
+ "alternatives": ["Use on-demand instances for critical training runs", "Enable frequent checkpointing (every 100 steps)"]
64
+ },
65
+ "vllm.realtime-inference.gpu-memory-utilization-semantics": {
66
+ "status": "yellow",
67
+ "message": "vLLM v0.20.2+ changed gpu_memory_utilization semantics. CUDA graph memory profiler reserves ~5.7 GiB/GPU automatically. Setting 0.92 is effectively 0.66 after graph reservation. For 14B+ models on A10G, set 0.95-0.97. For models that still OOM, the issue is usually LoRA pre-allocation (max_loras × max_lora_rank), not the utilization setting itself.",
68
+ "alternatives": ["Set gpu_memory_utilization=0.95-0.97 for tight-fit models", "Reduce max_loras and max_lora_rank", "Use FP8 quantization to halve model footprint"]
69
+ },
70
+ "sglang.realtime-inference.lora": {
71
+ "status": "red",
72
+ "message": "adapter_sidecar.py raises NotImplementedError. SGLang LoRA adapter routing is a stub.",
73
+ "alternatives": ["Use vllm engine for LoRA workloads — fully validated with hot-swap"],
74
+ "unblock_spec": "e9-s1-lora-validation",
75
+ "estimated_effort": "~2 weeks"
76
+ },
77
+ "training.multi-node": {
78
+ "status": "red",
79
+ "message": "Multi-node distributed training not implemented. NCCL cross-node communication and data sharding logic missing.",
80
+ "alternatives": ["Use single-node multi-GPU (up to 8x A100 on p4d.24xlarge)", "Use SageMaker HyperPod for managed multi-node (separate workflow)"],
81
+ "unblock_spec": "e12-multi-node-training",
82
+ "estimated_effort": "~3 weeks"
83
+ },
84
+ "vllm.realtime-inference.speculative-decoding": {
85
+ "status": "red",
86
+ "message": "Speculative decoding configuration exists in vLLM but integration with IC environment variables not implemented.",
87
+ "alternatives": ["Use standard autoregressive decoding", "Use FP8 quantization for throughput improvement"],
88
+ "unblock_spec": "e14-speculative-decoding",
89
+ "estimated_effort": "~1 week"
90
+ },
91
+ "workflow.do-import": {
92
+ "status": "red",
93
+ "message": "do/import script is a placeholder. Model import from external registries not wired up.",
94
+ "alternatives": ["Manually download model with huggingface-cli and stage via do/stage"],
95
+ "unblock_spec": "e10-import-workflow",
96
+ "estimated_effort": "~2 weeks"
97
+ },
98
+ "workflow.do-regenerate": {
99
+ "status": "red",
100
+ "message": "do/regenerate is not implemented. Project re-generation from updated templates requires manual recreation.",
101
+ "alternatives": ["Create a new project and copy do/config settings over"],
102
+ "unblock_spec": "e11-regenerate",
103
+ "estimated_effort": "~1 week"
104
+ }
105
+ }
106
+ }
@@ -0,0 +1,341 @@
1
+ """Environment health check for ml-container-creator.
2
+
3
+ Runs at startup to verify the tool is installed correctly and the
4
+ environment meets prerequisites. No LLM needed — pure code checks.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import importlib.metadata
10
+ import json
11
+ import os
12
+ import re
13
+ import subprocess
14
+ import sys
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+
19
@dataclass
class HealthItem:
    """Outcome of one environment health check.

    Instances are rendered as a single terminal line consisting of a
    colored status glyph, the check label, and a human-readable message.
    """

    status: str  # "pass", "warn", "fail"
    label: str
    message: str

    @property
    def icon(self) -> str:
        """ANSI-colored glyph for ``status``; ``"?"`` for any other value."""
        # status -> (ANSI color code, glyph)
        palette = {
            "pass": ("32", "✓"),
            "warn": ("33", "⚠"),
            "fail": ("31", "✗"),
        }
        entry = palette.get(self.status)
        if entry is None:
            return "?"
        color, glyph = entry
        return "\033[" + color + "m" + glyph + "\033[0m"

    def __str__(self) -> str:
        """Format the item as one report line: icon, label, then message."""
        return " {} {}: {}".format(self.icon, self.label, self.message)
35
+
36
+
37
+ # Path to the bootstrap profile config
38
+ _BOOTSTRAP_CONFIG_PATH = Path.home() / ".ml-container-creator" / "config.json"
39
+
40
+ # Required pip packages for core functionality
41
+ _REQUIRED_PACKAGES = ["sagemaker", "boto3", "huggingface_hub"]
42
+
43
+ # Minimum versions
44
+ _MIN_PYTHON = (3, 10)
45
+ _MIN_NODE = 24
46
+
47
+
48
class EnvironmentHealthCheck:
    """Check environment prerequisites at startup.

    No LLM needed. Verifies that ml-container-creator is installed
    correctly and the environment is properly configured. Every check
    returns a HealthItem and is written so that a malformed config file
    or a missing optional dependency is reported as a result instead of
    raising out of the health check.
    """

    def run(self, project_dir: str | None = None) -> list[HealthItem]:
        """Run all health checks.

        Args:
            project_dir: Path to a project directory (contains do/config).
                If None, only environment-level checks run.

        Returns:
            List of HealthItem results, one per check, in a fixed order.
        """
        items: list[HealthItem] = [
            self._check_python_version(),
            self._check_node_version(),
            self._check_pip_packages(),
            self._check_bootstrap_profile(),
            self._check_aws_credentials(),
            self._check_mcp_servers(),
        ]
        if project_dir:
            items.append(self._check_secrets_configured(project_dir))
        items.append(self._check_benchmark_infra())
        return items

    @staticmethod
    def _read_bootstrap_config() -> dict:
        """Parse the bootstrap config file.

        Raises:
            OSError: The file cannot be read.
            ValueError: The content is not valid UTF-8/JSON
                (json.JSONDecodeError and UnicodeDecodeError are both
                ValueError subclasses) or is not a JSON object.
        """
        config = json.loads(_BOOTSTRAP_CONFIG_PATH.read_text())
        if not isinstance(config, dict):
            raise ValueError("top-level JSON value is not an object")
        return config

    @staticmethod
    def _lookup_profile(config: dict, name: object) -> dict:
        """Return the profile mapping stored under ``name``, or ``{}`` if absent or malformed."""
        profiles = config.get("profiles")
        # JSON object keys are always strings, so a non-string name can never match.
        if not isinstance(profiles, dict) or not isinstance(name, str):
            return {}
        profile = profiles.get(name)
        return profile if isinstance(profile, dict) else {}

    def _check_python_version(self) -> HealthItem:
        """Check that the running interpreter is at least ``_MIN_PYTHON``."""
        current = sys.version_info[:2]
        version_str = f"{current[0]}.{current[1]}"
        # Derive the displayed minimum from the constant so the two cannot drift apart.
        required = ".".join(str(part) for part in _MIN_PYTHON)
        if current >= _MIN_PYTHON:
            return HealthItem("pass", "Python version", f"{version_str} (>= {required})")
        return HealthItem(
            "fail",
            "Python version",
            f"{version_str} — requires >= {required}",
        )

    def _check_node_version(self) -> HealthItem:
        """Check that ``node --version`` reports a major version >= ``_MIN_NODE``."""
        try:
            result = subprocess.run(
                ["node", "--version"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except FileNotFoundError:
            return HealthItem("fail", "Node.js version", "node not found in PATH")
        except subprocess.TimeoutExpired:
            return HealthItem("warn", "Node.js version", "node --version timed out")
        except OSError as e:
            # e.g. PermissionError when a non-executable "node" is on PATH.
            return HealthItem("fail", "Node.js version", f"node could not be executed: {e}")

        if result.returncode != 0:
            return HealthItem("fail", "Node.js version", "node command failed")

        # Parse version string like "v24.1.0" or "v22.12.0"
        version_output = result.stdout.strip()
        match = re.match(r"v?(\d+)\.(\d+)\.(\d+)", version_output)
        if not match:
            return HealthItem("warn", "Node.js version", f"Could not parse: {version_output}")

        major = int(match.group(1))
        if major >= _MIN_NODE:
            return HealthItem("pass", "Node.js version", f"{version_output} (>= {_MIN_NODE})")
        return HealthItem(
            "fail",
            "Node.js version",
            f"{version_output} — requires >= {_MIN_NODE}",
        )

    def _check_pip_packages(self) -> HealthItem:
        """Check that every distribution in ``_REQUIRED_PACKAGES`` is installed.

        Passes when all are present, fails when none are, warns otherwise.
        """
        missing: list[str] = []
        installed: list[str] = []

        for pkg in _REQUIRED_PACKAGES:
            try:
                version = importlib.metadata.version(pkg)
            except importlib.metadata.PackageNotFoundError:
                missing.append(pkg)
            else:
                installed.append(f"{pkg}=={version}")

        if not missing:
            return HealthItem("pass", "Pip packages", ", ".join(installed))
        if len(missing) == len(_REQUIRED_PACKAGES):
            return HealthItem("fail", "Pip packages", f"Missing: {', '.join(missing)}")
        return HealthItem(
            "warn",
            "Pip packages",
            f"Missing: {', '.join(missing)} (have: {', '.join(installed)})",
        )

    def _check_bootstrap_profile(self) -> HealthItem:
        """Check the bootstrap config exists and has a usable active profile.

        Fails when the file is missing or unparseable; warns when the
        active profile is unset, unknown, or lacks accountId/roleArn.
        """
        if not _BOOTSTRAP_CONFIG_PATH.exists():
            return HealthItem(
                "fail",
                "Bootstrap profile",
                f"{_BOOTSTRAP_CONFIG_PATH} not found — run 'ml-container-creator bootstrap'",
            )

        try:
            config = self._read_bootstrap_config()
        except (ValueError, OSError) as e:
            return HealthItem("fail", "Bootstrap profile", f"Cannot parse config: {e}")

        active_profile_name = config.get("activeProfile")
        if not active_profile_name:
            return HealthItem("warn", "Bootstrap profile", "No activeProfile set")

        profile = self._lookup_profile(config, active_profile_name)
        if not profile:
            return HealthItem(
                "warn",
                "Bootstrap profile",
                f"activeProfile '{active_profile_name}' not found in profiles",
            )

        # Check required fields
        missing_fields = [key for key in ("accountId", "roleArn") if not profile.get(key)]
        if missing_fields:
            return HealthItem(
                "warn",
                "Bootstrap profile",
                f"Profile '{active_profile_name}' missing: {', '.join(missing_fields)}",
            )

        return HealthItem(
            "pass",
            "Bootstrap profile",
            f"Active: {active_profile_name} (account: {profile['accountId']})",
        )

    def _check_aws_credentials(self) -> HealthItem:
        """Check AWS credentials via STS get_caller_identity with short timeouts."""
        # Import in a separate try block: if the import failed inside the
        # same try as the STS call, the handler names below would be unbound
        # and evaluating ``except NoCredentialsError`` would itself raise.
        try:
            import boto3
            from botocore.config import Config
            from botocore.exceptions import ClientError, NoCredentialsError
        except ImportError:
            return HealthItem("warn", "AWS credentials", "Could not verify: boto3 is not installed")

        try:
            sts = boto3.client("sts", config=Config(connect_timeout=5, read_timeout=5))
            identity = sts.get_caller_identity()
            account = identity.get("Account", "unknown")
            arn = identity.get("Arn", "")
            # Show a short version of the ARN (last segment)
            short_arn = arn.split("/")[-1] if "/" in arn else arn
            return HealthItem("pass", "AWS credentials", f"Account {account} ({short_arn})")
        except NoCredentialsError:
            return HealthItem(
                "fail",
                "AWS credentials",
                "No credentials found — configure AWS_PROFILE or environment variables",
            )
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "Unknown")
            return HealthItem("fail", "AWS credentials", f"STS call failed: {error_code}")
        except Exception as e:
            # Catch EndpointConnectionError and other network issues
            error_name = type(e).__name__
            return HealthItem("warn", "AWS credentials", f"Could not verify: {error_name}")

    def _check_mcp_servers(self) -> HealthItem:
        """Verify config/mcp.json exists in the installed package and lists servers."""
        # This file lives at <package root>/src/agent/health_check.py, so the
        # package root is three ``parent`` hops up from the file itself.
        package_root = Path(__file__).resolve().parent.parent.parent
        mcp_config_path = package_root / "config" / "mcp.json"

        if not mcp_config_path.exists():
            return HealthItem(
                "fail",
                "MCP servers",
                f"config/mcp.json not found at {mcp_config_path}",
            )

        try:
            mcp_config = json.loads(mcp_config_path.read_text())
        except (ValueError, OSError) as e:
            return HealthItem("fail", "MCP servers", f"Cannot parse mcp.json: {e}")

        # Tolerate a non-object document or a null/scalar "mcpServers" value:
        # both are reported as "no servers" rather than crashing.
        servers = mcp_config.get("mcpServers") if isinstance(mcp_config, dict) else None
        count = len(servers) if isinstance(servers, (dict, list)) else 0
        if count == 0:
            return HealthItem("warn", "MCP servers", "config/mcp.json has no servers defined")
        return HealthItem("pass", "MCP servers", f"{count} servers configured")

    def _check_secrets_configured(self, project_dir: str) -> HealthItem:
        """Check if HF_TOKEN or a secrets file is present when the project may need it.

        Only relevant when inside a project directory.

        Args:
            project_dir: Project root; ``do/config`` and ``do/secrets.conf``
                are looked up beneath it.
        """
        project_path = Path(project_dir)

        # Check if this project likely needs HF_TOKEN (gated model references)
        do_config_path = project_path / "do" / "config"
        needs_hf_token = False
        if do_config_path.exists():
            try:
                content = do_config_path.read_text()
            except (OSError, ValueError):
                # Best effort: an unreadable or undecodable do/config is
                # treated the same as one that does not mention HF_MODEL_ID.
                content = ""
            # Heuristic: if HF_MODEL_ID is set, user likely needs HF access
            needs_hf_token = "HF_MODEL_ID" in content

        if not needs_hf_token:
            return HealthItem("pass", "Secrets", "No gated model detected — HF_TOKEN not required")

        # Check HF_TOKEN env var
        if os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN"):
            return HealthItem("pass", "Secrets", "HF_TOKEN is set")

        # Check for secrets file in project
        secrets_file = project_path / "do" / "secrets.conf"
        if secrets_file.exists():
            return HealthItem("pass", "Secrets", "do/secrets.conf found")

        return HealthItem(
            "warn",
            "Secrets",
            "HF_TOKEN not set and no do/secrets.conf — may fail for gated models",
        )

    def _check_benchmark_infra(self) -> HealthItem:
        """Check if benchmark S3 bucket and Glue database are in the active bootstrap profile.

        Never fails: every problem is reported as a warning.
        """
        if not _BOOTSTRAP_CONFIG_PATH.exists():
            return HealthItem("warn", "Benchmark infra", "No bootstrap profile to check")

        try:
            config = self._read_bootstrap_config()
        except (ValueError, OSError):
            return HealthItem("warn", "Benchmark infra", "Cannot read bootstrap profile")

        active_profile_name = config.get("activeProfile")
        if not active_profile_name:
            return HealthItem("warn", "Benchmark infra", "No active profile set")

        # A missing or malformed profile yields {} and is reported as both keys missing.
        profile = self._lookup_profile(config, active_profile_name)

        required_keys = ("ciBenchmarkResultsBucket", "ciGlueDatabase")
        missing = [key for key in required_keys if not profile.get(key)]
        if not missing:
            return HealthItem(
                "pass",
                "Benchmark infra",
                f"S3: {profile['ciBenchmarkResultsBucket']}, Glue: {profile['ciGlueDatabase']}",
            )
        return HealthItem(
            "warn",
            "Benchmark infra",
            f"Missing in profile: {', '.join(missing)} — benchmarks won't persist results",
        )
314
+
315
+
316
def print_health_report(items: list[HealthItem]) -> None:
    """Write the health report to stdout.

    Output is a bold title, a rule, one line per item (its ``str()``),
    a second rule, and a colored summary listing the non-zero counts of
    passed, warning, and failed checks in that order.

    Args:
        items: List of HealthItem results from EnvironmentHealthCheck.run().
    """
    rule = "─" * 40

    print("\n\033[1mEnvironment Health Check\033[0m")
    print(rule)
    for entry in items:
        print(str(entry))
    print(rule)

    # (status value, ANSI color code, noun shown in the summary), in display order.
    summary_spec = (
        ("pass", "32", "passed"),
        ("warn", "33", "warnings"),
        ("fail", "31", "failed"),
    )
    summary = []
    for status, color, noun in summary_spec:
        count = len([entry for entry in items if entry.status == status])
        if count:
            summary.append("\033[" + color + "m" + str(count) + " " + noun + "\033[0m")

    print(" " + ", ".join(summary))
    print()
@@ -0,0 +1,173 @@
1
+ # ml-container-creator Advisor
2
+
3
+ ## Identity & Personality
4
+
5
+ You are the ml-container-creator advisor — a candid infrastructure expert who helps developers deploy ML models on AWS SageMaker using vLLM, SGLang, and custom training pipelines.
6
+
7
+ Your communication style:
8
+ - Lead with the answer, then explain the reasoning
9
+ - Reference specific files and config keys — never give vague guidance
10
+ - Be honest about limitations: if something is unvalidated or broken, say so plainly
11
+ - When you don't know something, say "I'm not sure about this" — never fabricate instance specs, VRAM numbers, or config options
12
+ - Keep responses concise for simple questions, detailed for complex ones
13
+ - Use concrete examples: show the exact file path, variable name, and value to change
14
+
15
+ You are advisory-only. You do NOT execute scripts, provision infrastructure, or modify project config files. You can write planning artifacts (TODO.md, action plans) via the write_file tool.
16
+
17
+ ## Project Context
18
+
19
+ {project_context_json}
20
+
21
+ ## Available Tools
22
+
23
+ You have access to the following tools. Call them BEFORE answering whenever you need factual data. Do not guess when you can query. Do not wait for the user to ask you to look something up — if answering their question requires specific data, call the tool proactively.
24
+
25
+ ### instance-sizer
26
+ GPU specifications, VRAM per instance type, instance recommendations for a given model size and workload. Use this when the user asks about instance selection, VRAM capacity, GPU count, or whether a model will fit on a given instance.
27
+
28
+ ### base-image-picker
29
+ Base Docker images for each serving framework, fleet driver versions, CUDA compatibility matrix. Use this when the user asks about base images, driver versions, CUDA versions, or framework compatibility.
30
+
31
+ ### model-picker
32
+ Model metadata: parameter counts, architectures, supported features, quantization options, context length defaults. Use this when the user asks about a specific model's requirements or characteristics.
33
+
34
+ ### workload-picker
35
+ Benchmark workload profiles: concurrency levels, prompt lengths, generation lengths, traffic patterns. Use this when the user asks about benchmarking configuration or workload simulation.
36
+
37
+ ### e2e-status
38
+ End-to-end validation status: which model + instance + engine combinations have been tested successfully. Use this to determine if a configuration is on the golden path or untested.
39
+
40
+ ### agent-knowledge
41
+ Aggregated project knowledge covering four topics:
42
+ - `script_reference` — Purpose, flags, inputs, outputs, and lifecycle position of each do/ script
43
+ - `config_reference` — All do/config variables, IC_ENV_* variables, and training config options with descriptions
44
+ - `troubleshooting` — Known failure patterns with root cause, diagnostic steps, and fixes
45
+ - `capability_matrix` — Current green/yellow/red status of all features
46
+
47
+ Use this when the user asks about scripts, config variables, troubleshooting errors, or feature status.
48
+
49
+ ### write_file
50
+ Write a file to the project directory. Scoped to the project root — no path traversal allowed. Use this when the user asks you to save an action plan, TODO list, or recommendation summary.
51
+
52
+ ### Tool Usage Rules
53
+
54
+ 1. **Call tools first.** When a question involves instance specs, model metadata, config variables, script behavior, or validation status — query the relevant tool before composing your answer.
55
+ 2. **Combine tool results.** Many questions require correlating data from multiple tools (e.g., model size from model-picker + VRAM from instance-sizer).
56
+ 3. **Cite your sources.** When referencing data from a tool, mention where it came from: "According to the instance catalog..." or "From e2e validation status...".
57
+ 4. **Do not hallucinate specs.** If a tool doesn't return data for a specific instance type or model, say so. Do not fill in the gap from memory.
58
+
59
+ ## Capability Matrix
60
+
61
+ The following summarizes what works, what's experimental, and what's broken in the current version of ml-container-creator. Reference this when the user asks about feature support, when recommending configurations, or when they attempt to use an unvalidated path.
62
+
63
+ {capability_matrix_json}
64
+
65
+ ### How to use the capability matrix:
66
+
67
+ - **Green (fully validated):** Recommend confidently. These paths have end-to-end test coverage and benchmark baselines.
68
+ - **Yellow (functional but lightly validated):** Recommend with caveats. Mention that the feature works but has limited test coverage. Note the alternatives.
69
+ - **Red (broken or not implemented):** Do not recommend. Explain what's missing, point to the alternative, and mention the unblock spec if the user wants to track progress.
70
+
71
+ ## Uncertainty Protocol
72
+
73
+ Apply the "⚠️ Unvalidated:" prefix in these situations:
74
+
75
+ 1. **Off-golden-path configurations:** When recommending a model + instance + engine combination that does NOT appear in e2e-status as validated, prefix the recommendation:
76
+ > ⚠️ Unvalidated: This configuration (Mixtral-8x7B on g5.48xlarge with TP=8) hasn't been tested end-to-end. It should work based on VRAM math, but there are no benchmark baselines to compare against.
77
+
78
+ 2. **Yellow-status features:** When suggesting a feature classified as yellow in the capability matrix:
79
+ > ⚠️ Unvalidated: SGLang base inference is functional but only 3 models have been tested. Consider vLLM for production workloads.
80
+
81
+ 3. **Estimated values:** When providing VRAM estimates, throughput projections, or cost calculations that haven't been measured:
82
+ > ⚠️ Unvalidated: Based on parameter count (8B × 2 bytes FP16 = ~16GB model weight), this should fit on g5.xlarge (24GB VRAM) with ~8GB for KV cache. Run `do/benchmark` to confirm actual memory usage.
83
+
84
+ 4. **Configuration interactions you haven't verified:** When suggesting combinations of settings where the interaction isn't well-documented:
85
+ > ⚠️ Unvalidated: Setting max_model_len=8192 with FP8 quantization on this model should work, but I haven't seen this exact combination tested. Start with max_model_len=4096 and increase if benchmark results look stable.
86
+
87
+ ### When NOT to use the prefix:
88
+
89
+ - Facts directly returned by tools (instance specs, model metadata, validation status)
90
+ - Green-path recommendations with matching e2e-status entries
91
+ - Information from docs/TROUBLESHOOTING.md or config reference
92
+ - Direct quotes from project config files (do/config, do/ic/*.conf)
93
+
94
+ ### Handling complete uncertainty:
95
+
96
+ If you genuinely don't know something and no tool can answer it, say so directly:
97
+ > I don't have data on that. You could check [specific resource] or try [specific diagnostic step].
98
+
99
+ Never guess. Never fill gaps with plausible-sounding but unverified information.
100
+
101
+ ## Response Guidelines
102
+
103
+ ### Instance Sizing & Memory
104
+
105
+ When answering questions about whether a model fits on an instance:
106
+ 1. Call instance-sizer to get the exact VRAM for the instance
107
+ 2. Call model-picker to get parameter count and architecture
108
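+ 3. Calculate: model weight (params × bytes_per_param) + KV cache overhead + runtime overhead (~2GB). On vLLM v0.20.2+, also budget the ~5.7 GiB per GPU reserved by the CUDA graph profiler (see the capability matrix)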
109
+ 4. Show your math explicitly so the user can verify
110
+
111
+ Memory formula for reference (always verify against tool data):
112
+ - FP16: params × 2 bytes
113
+ - FP8: params × 1 byte
114
+ - INT4/AWQ: params × 0.5 bytes
115
+ - KV cache per token: 2 (one K and one V tensor) × num_layers × hidden_dim × (num_kv_heads / num_heads) × 2 bytes (FP16)
116
+
117
+ ### Configuration Recommendations
118
+
119
+ When suggesting config changes:
120
+ - Always specify the exact file: `do/config`, `do/ic/default.conf`, `do/training/config.yaml`
121
+ - Always specify the exact variable name and the value to set
122
+ - Explain the WHY: what problem does this solve or what improvement does it provide
123
+ - If the change has prerequisites or side effects, mention them
124
+
125
+ Example format:
126
+ > Set `IC_ENV_VLLM_MAX_MODEL_LEN=4096` in `do/ic/default.conf`. This caps the KV cache allocation to 4096 tokens, which keeps total VRAM usage under 22GB on your g5.xlarge — leaving headroom for request batching.
127
+
128
+ ### Troubleshooting
129
+
130
+ When the user pastes an error message:
131
+ 1. Call agent-knowledge with topic `troubleshooting` to check for known patterns
132
+ 2. If it matches a known pattern, provide the structured diagnosis (root cause → diagnostic steps → fix)
133
+ 3. If it doesn't match, reason from first principles about what the error means in the SageMaker/vLLM/container context
134
+ 4. Distinguish between user-fixable issues (config change, code fix) and infrastructure issues (quota increase, support ticket)
135
+ 5. Always suggest a specific next step — never leave the user without an action to take
136
+
137
+ ### Action Plans
138
+
139
+ When the user asks for help planning a workflow (deploy a model, set up training, run benchmarks):
140
+ 1. Present a numbered step-by-step plan
141
+ 2. For each step, note the script to run and any config prerequisites
142
+ 3. Call agent-knowledge with `script_reference` to get the correct flags and inputs for each script
143
+ 4. Offer to save the plan: "Want me to write this to TODO.md in your project?"
144
+ 5. If they accept, use write_file to save it
145
+
146
+ ### Script Reference
147
+
148
+ The project has 22 `do/` scripts. When asked about a script:
149
+ - Explain its purpose and where it fits in the lifecycle (stage → build → push → deploy → benchmark → optimize)
150
+ - List key flags and their effect
151
+ - Mention what it reads (config files, env vars) and what it produces (artifacts, endpoints, reports)
152
+ - Note common failure modes and how to resolve them
153
+
154
+ ### Multi-Turn Awareness
155
+
156
+ - Remember what the user told you earlier in the conversation. Don't ask for information they already provided.
157
+ - If the project context shows a specific model/instance/engine, use that as the default for all answers unless the user specifies otherwise.
158
+ - Build on previous recommendations. If you suggested a config change earlier, reference it when it becomes relevant again.
159
+
160
+ ### What You Cannot Do
161
+
162
+ Be explicit about boundaries:
163
+ - You cannot run scripts or commands. You can only explain how to run them.
164
+ - You cannot modify do/config, do/ic/*.conf, or any other existing project file. The write_file tool is only for creating new planning artifacts such as TODO.md.
165
+ - You cannot make AWS API calls (no deploying, no checking endpoint status, no viewing CloudWatch logs).
166
+ - You cannot access the internet, external APIs, or HuggingFace Hub directly.
167
+ - If the user needs something you can't do, tell them the exact command to run themselves.
168
+
169
+ ## User-Provided Context
170
+
171
+ The following is optional domain knowledge provided by the project team via `.mlcc-agent-context.md`. Treat it as authoritative for this project's conventions and preferences. If it contradicts the general guidance above, defer to the user-provided context for this specific project:
172
+
173
+ {user_context_md}
@@ -0,0 +1,3 @@
1
+ strands-agents>=0.1.0
2
+ strands-agents-tools>=0.1.0
3
+ pyyaml>=6.0
package/src/app.js CHANGED
@@ -366,10 +366,11 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
366
366
  const trainIncluded = answers.deploymentTarget !== 'batch-transform';
367
367
  if (!trainIncluded) {
368
368
  ignorePatterns.push('**/do/train');
369
+ ignorePatterns.push('**/do/.train_helper.py');
369
370
  ignorePatterns.push('**/do/.train_build_request.py');
370
- ignorePatterns.push('**/do/.train_status_parser.py');
371
- ignorePatterns.push('**/do/.train_poll_parser.py');
372
371
  ignorePatterns.push('**/do/training/**');
372
+ ignorePatterns.push('**/do/evaluate');
373
+ ignorePatterns.push('**/do/.eval_helper.py');
373
374
  }
374
375
 
375
376
  // Exclude feedback.sh when neither tune nor train is included
@@ -404,10 +405,11 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
404
405
  ignorePatterns.push('**/do/.adapter_helper.py');
405
406
  ignorePatterns.push('**/do/.register_helper.py');
406
407
  ignorePatterns.push('**/do/train');
408
+ ignorePatterns.push('**/do/.train_helper.py');
407
409
  ignorePatterns.push('**/do/.train_build_request.py');
408
- ignorePatterns.push('**/do/.train_status_parser.py');
409
- ignorePatterns.push('**/do/.train_poll_parser.py');
410
410
  ignorePatterns.push('**/do/training/**');
411
+ ignorePatterns.push('**/do/evaluate');
412
+ ignorePatterns.push('**/do/.eval_helper.py');
411
413
  ignorePatterns.push('**/do/add-ic');
412
414
  ignorePatterns.push('**/do/run');
413
415
  ignorePatterns.push('**/sample_model/**');
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-29T13:37:06.271Z
3
+ // Generated: 2026-07-01T20:12:14.883Z
4
4
 
5
5
  /**
6
6
  * CLI option definitions derived from parameter-schema-v2.json.