npm - @aws/ml-container-creator - Versions diffs - 1.0.4 → 1.1.0 - Mend

@aws/ml-container-creator 1.0.4 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

package/README.md +9 -0
package/bin/cli.js +57 -0
package/config/agent.json +16 -0
package/package.json +4 -1
package/pyproject.toml +3 -0
package/servers/agent-knowledge/index.js +592 -0
package/servers/agent-knowledge/package.json +15 -0
package/src/agent/__init__.py +2 -0
package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
package/src/agent/agent.py +513 -0
package/src/agent/config_loader.py +215 -0
package/src/agent/context.py +380 -0
package/src/agent/data/capability-matrix.json +106 -0
package/src/agent/health_check.py +341 -0
package/src/agent/prompts/system.md +173 -0
package/src/agent/requirements-agent.txt +3 -0
package/src/lib/generated/cli-options.js +1 -1
package/src/lib/generated/parameter-matrix.js +1 -1
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/tune-config-state.js +89 -68
package/templates/do/config +6 -1
package/src/lib/auto-prompt-builder.js +0 -172
package/src/lib/cli-handler.js +0 -529
package/src/lib/community-reports-validator.js +0 -91
package/src/lib/configuration-exporter.js +0 -204
package/src/lib/dataset-slug.js +0 -152
package/src/lib/docker-introspection-validator.js +0 -51
package/src/lib/known-flags-validator.js +0 -200
package/src/lib/schema-validator.js +0 -157
package/src/lib/train-config-parser.js +0 -136
package/src/lib/train-config-persistence.js +0 -143
package/src/lib/train-config-validator.js +0 -112
package/src/lib/train-feedback.js +0 -46
package/src/lib/train-idempotency.js +0 -97
package/src/lib/train-request-builder.js +0 -120
package/src/lib/tune-dataset-validator.js +0 -279
package/src/lib/tune-output-resolver.js +0 -66

package/src/agent/data/capability-matrix.json ADDED Viewed

@@ -0,0 +1,106 @@
+{
+  "version": "1.0",
+  "capabilities": {
+    "vllm.realtime-inference.deploy": {
+      "status": "green",
+      "message": "Fully validated. 11 models (0.6B-8B) through happy path on g5.xlarge. 3× 14B models validated on g5.24xlarge (TP=4) with FP8 quantization: Qwen2.5-14B-Instruct, DeepSeek-R1-Distill-Qwen-14B, Qwen3-14B."
+    },
+    "vllm.realtime-inference.lora": {
+      "status": "green",
+      "message": "LoRA adapters via SageMaker Adapter ICs. Hot-swap validated on models up to 8B. 14B+ with LoRA requires FP8 quantization on A10G instances due to CUDA graph memory overhead."
+    },
+    "vllm.realtime-inference.multi-gpu": {
+      "status": "green",
+      "message": "Tensor parallelism validated on g5.24xlarge (4× A10G) and g5.12xlarge (4× A10G). TP degree auto-detected. Note: vLLM v0.20.2+ CUDA graph profiler reserves ~5.7 GiB per GPU — factor this into VRAM calculations."
+    },
+    "vllm.realtime-inference.quantization-fp8": {
+      "status": "green",
+      "message": "FP8 quantization validated. ~2x throughput improvement with minimal quality loss on supported models."
+    },
+    "training.custom.sft": {
+      "status": "green",
+      "message": "Supervised fine-tuning via SageMaker Training Jobs. Single-node multi-GPU validated."
+    },
+    "training.custom.dpo": {
+      "status": "green",
+      "message": "DPO/RLHF training pipeline validated. Preference dataset format documented."
+    },
+    "benchmarking.do-benchmark": {
+      "status": "green",
+      "message": "End-to-end benchmark pipeline: deploy, load-test, collect metrics, upload to S3/Glue. Athena-queryable results."
+    },
+    "registry.model-registration": {
+      "status": "green",
+      "message": "Model registration to SageMaker Model Registry with metadata, lineage, and approval workflows."
+    },
+    "registry.dataset-registration": {
+      "status": "green",
+      "message": "Dataset registration with versioning. Supports HuggingFace Hub and S3 sources."
+    },
+    "sglang.realtime-inference.deploy": {
+      "status": "yellow",
+      "message": "Base inference functional with RadixAttention. Lightly validated — 3 models tested. No LoRA support.",
+      "alternatives": ["Use vllm engine for production workloads with full validation coverage"]
+    },
+    "hyperpod.deploy": {
+      "status": "yellow",
+      "message": "HyperPod deployment functional but limited to single-node configurations. Multi-node orchestration not implemented.",
+      "alternatives": ["Use SageMaker realtime endpoints via vllm engine for validated multi-GPU inference"]
+    },
+    "optimization.do-optimize": {
+      "status": "yellow",
+      "message": "do/optimize is functional but lightly validated. Recommendations may be overly conservative or miss edge cases.",
+      "alternatives": ["Run do/benchmark manually and compare results across configurations"]
+    },
+    "vllm.realtime-inference.large-model-single-gpu": {
+      "status": "yellow",
+      "message": "14B+ parameter models on A10G GPUs (24GB) are memory-constrained even with TP=4. vLLM v0.20.2+ CUDA graph profiler reserves ~5.7 GiB/GPU, and LoRA pre-allocation (max_loras=30) consumes additional headroom. At FP16 with LoRA enabled, 14B OOMs on g5 instances. FP8 quantization is required.",
+      "alternatives": ["Enable FP8 quantization (IC_ENV_VLLM_QUANTIZATION=fp8) — halves model memory", "Reduce max_loras from 30 to 4 and max_lora_rank from 64 to 16", "Disable LoRA entirely for inference-only workloads", "Use L40S (48GB) or A100 (40/80GB) instances instead"]
+    },
+    "training.spot": {
+      "status": "yellow",
+      "message": "Spot training supported but checkpoint resume is inconsistent across interruptions. Manual monitoring recommended.",
+      "alternatives": ["Use on-demand instances for critical training runs", "Enable frequent checkpointing (every 100 steps)"]
+    },
+    "vllm.realtime-inference.gpu-memory-utilization-semantics": {
+      "status": "yellow",
+      "message": "vLLM v0.20.2+ changed gpu_memory_utilization semantics. CUDA graph memory profiler reserves ~5.7 GiB/GPU automatically. Setting 0.92 is effectively 0.66 after graph reservation. For 14B+ models on A10G, set 0.95-0.97. For models that still OOM, the issue is usually LoRA pre-allocation (max_loras × max_lora_rank), not the utilization setting itself.",
+      "alternatives": ["Set gpu_memory_utilization=0.95-0.97 for tight-fit models", "Reduce max_loras and max_lora_rank", "Use FP8 quantization to halve model footprint"]
+    },
+    "sglang.realtime-inference.lora": {
+      "status": "red",
+      "message": "adapter_sidecar.py raises NotImplementedError. SGLang LoRA adapter routing is a stub.",
+      "alternatives": ["Use vllm engine for LoRA workloads — fully validated with hot-swap"],
+      "unblock_spec": "e9-s1-lora-validation",
+      "estimated_effort": "~2 weeks"
+    },
+    "training.multi-node": {
+      "status": "red",
+      "message": "Multi-node distributed training not implemented. NCCL cross-node communication and data sharding logic missing.",
+      "alternatives": ["Use single-node multi-GPU (up to 8x A100 on p4d.24xlarge)", "Use SageMaker HyperPod for managed multi-node (separate workflow)"],
+      "unblock_spec": "e12-multi-node-training",
+      "estimated_effort": "~3 weeks"
+    },
+    "vllm.realtime-inference.speculative-decoding": {
+      "status": "red",
+      "message": "Speculative decoding configuration exists in vLLM but integration with IC environment variables not implemented.",
+      "alternatives": ["Use standard autoregressive decoding", "Use FP8 quantization for throughput improvement"],
+      "unblock_spec": "e14-speculative-decoding",
+      "estimated_effort": "~1 week"
+    },
+    "workflow.do-import": {
+      "status": "red",
+      "message": "do/import script is a placeholder. Model import from external registries not wired up.",
+      "alternatives": ["Manually download model with huggingface-cli and stage via do/stage"],
+      "unblock_spec": "e10-import-workflow",
+      "estimated_effort": "~2 weeks"
+    },
+    "workflow.do-regenerate": {
+      "status": "red",
+      "message": "do/regenerate is not implemented. Project re-generation from updated templates requires manual recreation.",
+      "alternatives": ["Create a new project and copy do/config settings over"],
+      "unblock_spec": "e11-regenerate",
+      "estimated_effort": "~1 week"
+    }
+  }
+}

package/src/agent/health_check.py ADDED Viewed

@@ -0,0 +1,341 @@
+"""Environment health check for ml-container-creator.
+Runs at startup to verify the tool is installed correctly and the
+environment meets prerequisites. No LLM needed — pure code checks.
+"""
+from __future__ import annotations
+import importlib.metadata
+import json
+import os
+import re
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class HealthItem:
+    """Single health check result."""
+    status: str  # "pass", "warn", "fail"
+    label: str
+    message: str
+    @property
+    def icon(self) -> str:
+        """Colored status indicator for terminal output."""
+        icons = {"pass": "\033[32m✓\033[0m", "warn": "\033[33m⚠\033[0m", "fail": "\033[31m✗\033[0m"}
+        return icons.get(self.status, "?")
+    def __str__(self) -> str:
+        return f"  {self.icon} {self.label}: {self.message}"
+# Path to the bootstrap profile config (~/.ml-container-creator/config.json).
+# Read by the bootstrap-profile and benchmark-infra checks below.
+_BOOTSTRAP_CONFIG_PATH = Path.home() / ".ml-container-creator" / "config.json"
+# Required pip packages for core functionality.
+# Each name is looked up with importlib.metadata.version().
+_REQUIRED_PACKAGES = ["sagemaker", "boto3", "huggingface_hub"]
+# Minimum versions.
+# Python is compared as a (major, minor) tuple; Node.js by major version only.
+_MIN_PYTHON = (3, 10)
+_MIN_NODE = 24
+class EnvironmentHealthCheck:
+    """Check environment prerequisites at startup.
+    No LLM needed. Verifies that ml-container-creator is installed
+    correctly and the environment is properly configured.
+    """
+    def run(self, project_dir: str | None = None) -> list[HealthItem]:
+        """Run all health checks.
+        Args:
+            project_dir: Path to a project directory (contains do/config).
+                         If None, only environment-level checks run.
+        Returns:
+            List of HealthItem results, one per check.
+        """
+        items: list[HealthItem] = []
+        items.append(self._check_python_version())
+        items.append(self._check_node_version())
+        items.append(self._check_pip_packages())
+        items.append(self._check_bootstrap_profile())
+        items.append(self._check_aws_credentials())
+        items.append(self._check_mcp_servers())
+        if project_dir:
+            items.append(self._check_secrets_configured(project_dir))
+            items.append(self._check_benchmark_infra())
+        return items
+    def _check_python_version(self) -> HealthItem:
+        """Check sys.version_info >= (3, 10)."""
+        current = sys.version_info[:2]
+        version_str = f"{current[0]}.{current[1]}"
+        if current >= _MIN_PYTHON:
+            return HealthItem("pass", "Python version", f"{version_str} (>= 3.10)")
+        return HealthItem(
+            "fail",
+            "Python version",
+            f"{version_str} — requires >= 3.10",
+        )
+    def _check_node_version(self) -> HealthItem:
+        """Check node --version >= 24 via subprocess."""
+        try:
+            result = subprocess.run(
+                ["node", "--version"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if result.returncode != 0:
+                return HealthItem("fail", "Node.js version", "node command failed")
+            # Parse version string like "v24.1.0" or "v22.12.0"
+            version_output = result.stdout.strip()
+            match = re.match(r"v?(\d+)\.(\d+)\.(\d+)", version_output)
+            if not match:
+                return HealthItem("warn", "Node.js version", f"Could not parse: {version_output}")
+            major = int(match.group(1))
+            if major >= _MIN_NODE:
+                return HealthItem("pass", "Node.js version", f"{version_output} (>= 24)")
+            return HealthItem(
+                "fail",
+                "Node.js version",
+                f"{version_output} — requires >= 24",
+            )
+        except FileNotFoundError:
+            return HealthItem("fail", "Node.js version", "node not found in PATH")
+        except subprocess.TimeoutExpired:
+            return HealthItem("warn", "Node.js version", "node --version timed out")
+    def _check_pip_packages(self) -> HealthItem:
+        """Check sagemaker, boto3, huggingface_hub are installed."""
+        missing: list[str] = []
+        installed: list[str] = []
+        for pkg in _REQUIRED_PACKAGES:
+            try:
+                version = importlib.metadata.version(pkg)
+                installed.append(f"{pkg}=={version}")
+            except importlib.metadata.PackageNotFoundError:
+                missing.append(pkg)
+        if not missing:
+            return HealthItem("pass", "Pip packages", ", ".join(installed))
+        if len(missing) == len(_REQUIRED_PACKAGES):
+            return HealthItem("fail", "Pip packages", f"Missing: {', '.join(missing)}")
+        return HealthItem(
+            "warn",
+            "Pip packages",
+            f"Missing: {', '.join(missing)} (have: {', '.join(installed)})",
+        )
+    def _check_bootstrap_profile(self) -> HealthItem:
+        """Check ~/.ml-container-creator/config.json exists and has a valid active profile."""
+        if not _BOOTSTRAP_CONFIG_PATH.exists():
+            return HealthItem(
+                "fail",
+                "Bootstrap profile",
+                f"{_BOOTSTRAP_CONFIG_PATH} not found — run 'ml-container-creator bootstrap'",
+            )
+        try:
+            config = json.loads(_BOOTSTRAP_CONFIG_PATH.read_text())
+        except (json.JSONDecodeError, OSError) as e:
+            return HealthItem("fail", "Bootstrap profile", f"Cannot parse config: {e}")
+        active_profile_name = config.get("activeProfile")
+        if not active_profile_name:
+            return HealthItem("warn", "Bootstrap profile", "No activeProfile set")
+        profiles = config.get("profiles", {})
+        profile = profiles.get(active_profile_name)
+        if not profile:
+            return HealthItem(
+                "warn",
+                "Bootstrap profile",
+                f"activeProfile '{active_profile_name}' not found in profiles",
+            )
+        # Check required fields
+        missing_fields: list[str] = []
+        if not profile.get("accountId"):
+            missing_fields.append("accountId")
+        if not profile.get("roleArn"):
+            missing_fields.append("roleArn")
+        if missing_fields:
+            return HealthItem(
+                "warn",
+                "Bootstrap profile",
+                f"Profile '{active_profile_name}' missing: {', '.join(missing_fields)}",
+            )
+        return HealthItem(
+            "pass",
+            "Bootstrap profile",
+            f"Active: {active_profile_name} (account: {profile['accountId']})",
+        )
+    def _check_aws_credentials(self) -> HealthItem:
+        """Check AWS credentials via STS get_caller_identity with short timeout."""
+        try:
+            import boto3
+            from botocore.config import Config
+            from botocore.exceptions import ClientError, NoCredentialsError
+            sts = boto3.client("sts", config=Config(connect_timeout=5, read_timeout=5))
+            identity = sts.get_caller_identity()
+            account = identity.get("Account", "unknown")
+            arn = identity.get("Arn", "")
+            # Show a short version of the ARN (last segment)
+            short_arn = arn.split("/")[-1] if "/" in arn else arn
+            return HealthItem("pass", "AWS credentials", f"Account {account} ({short_arn})")
+        except NoCredentialsError:
+            return HealthItem(
+                "fail",
+                "AWS credentials",
+                "No credentials found — configure AWS_PROFILE or environment variables",
+            )
+        except ClientError as e:
+            error_code = e.response.get("Error", {}).get("Code", "Unknown")
+            return HealthItem("fail", "AWS credentials", f"STS call failed: {error_code}")
+        except Exception as e:
+            # Catch EndpointConnectionError and other network issues
+            error_name = type(e).__name__
+            return HealthItem("warn", "AWS credentials", f"Could not verify: {error_name}")
+    def _check_mcp_servers(self) -> HealthItem:
+        """Verify config/mcp.json exists in the installed package."""
+        # Find the package root by looking relative to this file
+        # src/agent/health_check.py -> project root is ../../..
+        package_root = Path(__file__).resolve().parent.parent.parent
+        mcp_config_path = package_root / "config" / "mcp.json"
+        if not mcp_config_path.exists():
+            return HealthItem(
+                "fail",
+                "MCP servers",
+                f"config/mcp.json not found at {mcp_config_path}",
+            )
+        try:
+            mcp_config = json.loads(mcp_config_path.read_text())
+            servers = mcp_config.get("mcpServers", {})
+            count = len(servers)
+            if count == 0:
+                return HealthItem("warn", "MCP servers", "config/mcp.json has no servers defined")
+            return HealthItem("pass", "MCP servers", f"{count} servers configured")
+        except (json.JSONDecodeError, OSError) as e:
+            return HealthItem("fail", "MCP servers", f"Cannot parse mcp.json: {e}")
+    def _check_secrets_configured(self, project_dir: str) -> HealthItem:
+        """Check if HF_TOKEN or secrets file is present (if project uses gated models).
+        Only relevant when inside a project directory.
+        """
+        project_path = Path(project_dir)
+        # Check if this project likely needs HF_TOKEN (gated model references)
+        do_config_path = project_path / "do" / "config"
+        needs_hf_token = False
+        if do_config_path.exists():
+            try:
+                content = do_config_path.read_text()
+                # Heuristic: if HF_MODEL_ID is set, user likely needs HF access
+                if "HF_MODEL_ID" in content:
+                    needs_hf_token = True
+            except OSError:
+                pass
+        if not needs_hf_token:
+            return HealthItem("pass", "Secrets", "No gated model detected — HF_TOKEN not required")
+        # Check HF_TOKEN env var
+        if os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN"):
+            return HealthItem("pass", "Secrets", "HF_TOKEN is set")
+        # Check for secrets file in project
+        secrets_file = project_path / "do" / "secrets.conf"
+        if secrets_file.exists():
+            return HealthItem("pass", "Secrets", "do/secrets.conf found")
+        return HealthItem(
+            "warn",
+            "Secrets",
+            "HF_TOKEN not set and no do/secrets.conf — may fail for gated models",
+        )
+    def _check_benchmark_infra(self) -> HealthItem:
+        """Check if benchmark S3 bucket and Glue database are in bootstrap profile."""
+        if not _BOOTSTRAP_CONFIG_PATH.exists():
+            return HealthItem("warn", "Benchmark infra", "No bootstrap profile to check")
+        try:
+            config = json.loads(_BOOTSTRAP_CONFIG_PATH.read_text())
+        except (json.JSONDecodeError, OSError):
+            return HealthItem("warn", "Benchmark infra", "Cannot read bootstrap profile")
+        active_profile_name = config.get("activeProfile")
+        if not active_profile_name:
+            return HealthItem("warn", "Benchmark infra", "No active profile set")
+        profiles = config.get("profiles", {})
+        profile = profiles.get(active_profile_name, {})
+        has_bucket = bool(profile.get("ciBenchmarkResultsBucket"))
+        has_glue = bool(profile.get("ciGlueDatabase"))
+        if has_bucket and has_glue:
+            return HealthItem(
+                "pass",
+                "Benchmark infra",
+                f"S3: {profile['ciBenchmarkResultsBucket']}, Glue: {profile['ciGlueDatabase']}",
+            )
+        missing = []
+        if not has_bucket:
+            missing.append("ciBenchmarkResultsBucket")
+        if not has_glue:
+            missing.append("ciGlueDatabase")
+        return HealthItem(
+            "warn",
+            "Benchmark infra",
+            f"Missing in profile: {', '.join(missing)} — benchmarks won't persist results",
+        )
+def print_health_report(items: list[HealthItem]) -> None:
+    """Print a formatted health report to stdout.
+    Args:
+        items: List of HealthItem results from EnvironmentHealthCheck.run().
+    """
+    print("\n\033[1mEnvironment Health Check\033[0m")
+    print("─" * 40)
+    for item in items:
+        print(str(item))
+    # Summary line
+    fails = sum(1 for i in items if i.status == "fail")
+    warns = sum(1 for i in items if i.status == "warn")
+    passes = sum(1 for i in items if i.status == "pass")
+    print("─" * 40)
+    parts = []
+    if passes:
+        parts.append(f"\033[32m{passes} passed\033[0m")
+    if warns:
+        parts.append(f"\033[33m{warns} warnings\033[0m")
+    if fails:
+        parts.append(f"\033[31m{fails} failed\033[0m")
+    print(f"  {', '.join(parts)}")
+    print()

package/src/agent/prompts/system.md ADDED Viewed

@@ -0,0 +1,173 @@
+# ml-container-creator Advisor
+## Identity & Personality
+You are the ml-container-creator advisor — a candid infrastructure expert who helps developers deploy ML models on AWS SageMaker using vLLM, SGLang, and custom training pipelines.
+Your communication style:
+- Lead with the answer, then explain the reasoning
+- Reference specific files and config keys — never give vague guidance
+- Be honest about limitations: if something is unvalidated or broken, say so plainly
+- When you don't know something, say "I'm not sure about this" — never fabricate instance specs, VRAM numbers, or config options
+- Keep responses concise for simple questions, detailed for complex ones
+- Use concrete examples: show the exact file path, variable name, and value to change
+You are advisory-only. You do NOT execute scripts, provision infrastructure, or modify project config files. You can write planning artifacts (TODO.md, action plans) via the write_file tool.
+## Project Context
+{project_context_json}
+## Available Tools
+You have access to the following tools. Call them BEFORE answering whenever you need factual data. Do not guess when you can query. Do not wait for the user to ask you to look something up — if answering their question requires specific data, call the tool proactively.
+### instance-sizer
+GPU specifications, VRAM per instance type, instance recommendations for a given model size and workload. Use this when the user asks about instance selection, VRAM capacity, GPU count, or whether a model will fit on a given instance.
+### base-image-picker
+Base Docker images for each serving framework, fleet driver versions, CUDA compatibility matrix. Use this when the user asks about base images, driver versions, CUDA versions, or framework compatibility.
+### model-picker
+Model metadata: parameter counts, architectures, supported features, quantization options, context length defaults. Use this when the user asks about a specific model's requirements or characteristics.
+### workload-picker
+Benchmark workload profiles: concurrency levels, prompt lengths, generation lengths, traffic patterns. Use this when the user asks about benchmarking configuration or workload simulation.
+### e2e-status
+End-to-end validation status: which model + instance + engine combinations have been tested successfully. Use this to determine if a configuration is on the golden path or untested.
+### agent-knowledge
+Aggregated project knowledge covering four topics:
+- `script_reference` — Purpose, flags, inputs, outputs, and lifecycle position of each do/ script
+- `config_reference` — All do/config variables, IC_ENV_* variables, and training config options with descriptions
+- `troubleshooting` — Known failure patterns with root cause, diagnostic steps, and fixes
+- `capability_matrix` — Current green/yellow/red status of all features
+Use this when the user asks about scripts, config variables, troubleshooting errors, or feature status.
+### write_file
+Write a file to the project directory. Scoped to the project root — no path traversal allowed. Use this when the user asks you to save an action plan, TODO list, or recommendation summary.
+### Tool Usage Rules
+1. **Call tools first.** When a question involves instance specs, model metadata, config variables, script behavior, or validation status — query the relevant tool before composing your answer.
+2. **Combine tool results.** Many questions require correlating data from multiple tools (e.g., model size from model-picker + VRAM from instance-sizer).
+3. **Cite your sources.** When referencing data from a tool, mention where it came from: "According to the instance catalog..." or "From e2e validation status...".
+4. **Do not hallucinate specs.** If a tool doesn't return data for a specific instance type or model, say so. Do not fill in the gap from memory.
+## Capability Matrix
+The following summarizes what works, what's experimental, and what's broken in the current version of ml-container-creator. Reference this when the user asks about feature support, when recommending configurations, or when they attempt to use an unvalidated path.
+{capability_matrix_json}
+### How to use the capability matrix:
+- **Green (fully validated):** Recommend confidently. These paths have end-to-end test coverage and benchmark baselines.
+- **Yellow (functional but lightly validated):** Recommend with caveats. Mention that the feature works but has limited test coverage. Note the alternatives.
+- **Red (broken or not implemented):** Do not recommend. Explain what's missing, point to the alternative, and mention the unblock spec if the user wants to track progress.
+## Uncertainty Protocol
+Apply the "⚠️ Unvalidated:" prefix in these situations:
+1. **Off-golden-path configurations:** When recommending a model + instance + engine combination that does NOT appear in e2e-status as validated, prefix the recommendation:
+   > ⚠️ Unvalidated: This configuration (Mixtral-8x7B on g5.48xlarge with TP=8) hasn't been tested end-to-end. It should work based on VRAM math, but there are no benchmark baselines to compare against.
+2. **Yellow-status features:** When suggesting a feature classified as yellow in the capability matrix:
+   > ⚠️ Unvalidated: SGLang base inference is functional but only 3 models have been tested. Consider vLLM for production workloads.
+3. **Estimated values:** When providing VRAM estimates, throughput projections, or cost calculations that haven't been measured:
+   > ⚠️ Unvalidated: Based on parameter count (8B × 2 bytes FP16 = ~16GB model weight), this should fit on g5.xlarge (24GB VRAM) with ~8GB for KV cache. Run `do/benchmark` to confirm actual memory usage.
+4. **Configuration interactions you haven't verified:** When suggesting combinations of settings where the interaction isn't well-documented:
+   > ⚠️ Unvalidated: Setting max_model_len=8192 with FP8 quantization on this model should work, but I haven't seen this exact combination tested. Start with max_model_len=4096 and increase if benchmark results look stable.
+### When NOT to use the prefix:
+- Facts directly returned by tools (instance specs, model metadata, validation status)
+- Green-path recommendations with matching e2e-status entries
+- Information from docs/TROUBLESHOOTING.md or config reference
+- Direct quotes from project config files (do/config, do/ic/*.conf)
+### Handling complete uncertainty:
+If you genuinely don't know something and no tool can answer it, say so directly:
+> I don't have data on that. You could check [specific resource] or try [specific diagnostic step].
+Never guess. Never fill gaps with plausible-sounding but unverified information.
+## Response Guidelines
+### Instance Sizing & Memory
+When answering questions about whether a model fits on an instance:
+1. Call instance-sizer to get the exact VRAM for the instance
+2. Call model-picker to get parameter count and architecture
+3. Calculate: model weight (params × bytes_per_param) + KV cache overhead + runtime overhead (~2GB)
+4. Show your math explicitly so the user can verify
+Memory formula for reference (always verify against tool data):
+- FP16: params × 2 bytes
+- FP8: params × 1 byte
+- INT4/AWQ: params × 0.5 bytes
+- KV cache per token: 2 × num_layers × hidden_dim × 2 bytes (FP16) × num_kv_heads/num_heads
+### Configuration Recommendations
+When suggesting config changes:
+- Always specify the exact file: `do/config`, `do/ic/default.conf`, `do/training/config.yaml`
+- Always specify the exact variable name and the value to set
+- Explain the WHY: what problem does this solve or what improvement does it provide
+- If the change has prerequisites or side effects, mention them
+Example format:
+> Set `IC_ENV_VLLM_MAX_MODEL_LEN=4096` in `do/ic/default.conf`. This caps the KV cache allocation to 4096 tokens, which keeps total VRAM usage under 22GB on your g5.xlarge — leaving headroom for request batching.
+### Troubleshooting
+When the user pastes an error message:
+1. Call agent-knowledge with topic `troubleshooting` to check for known patterns
+2. If it matches a known pattern, provide the structured diagnosis (root cause → diagnostic steps → fix)
+3. If it doesn't match, reason from first principles about what the error means in the SageMaker/vLLM/container context
+4. Distinguish between user-fixable issues (config change, code fix) and infrastructure issues (quota increase, support ticket)
+5. Always suggest a specific next step — never leave the user without an action to take
+### Action Plans
+When the user asks for help planning a workflow (deploy a model, set up training, run benchmarks):
+1. Present a numbered step-by-step plan
+2. For each step, note the script to run and any config prerequisites
+3. Call agent-knowledge with `script_reference` to get the correct flags and inputs for each script
+4. Offer to save the plan: "Want me to write this to TODO.md in your project?"
+5. If they accept, use write_file to save it
+### Script Reference
+The project has 22 `do/` scripts. When asked about a script:
+- Explain its purpose and where it fits in the lifecycle (stage → build → push → deploy → benchmark → optimize)
+- List key flags and their effect
+- Mention what it reads (config files, env vars) and what it produces (artifacts, endpoints, reports)
+- Note common failure modes and how to resolve them
+### Multi-Turn Awareness
+- Remember what the user told you earlier in the conversation. Don't ask for information they already provided.
+- If the project context shows a specific model/instance/engine, use that as the default for all answers unless the user specifies otherwise.
+- Build on previous recommendations. If you suggested a config change earlier, reference it when it becomes relevant again.
+### What You Cannot Do
+Be explicit about boundaries:
+- You cannot run scripts or commands. You can only explain how to run them.
+- You cannot modify do/config, do/ic/*.conf, or any project file except via write_file (which creates new files like TODO.md).
+- You cannot make AWS API calls (no deploying, no checking endpoint status, no viewing CloudWatch logs).
+- You cannot access the internet, external APIs, or HuggingFace Hub directly.
+- If the user needs something you can't do, tell them the exact command to run themselves.
+## User-Provided Context
+The following is optional domain knowledge provided by the project team via `.mlcc-agent-context.md`. Treat it as authoritative for this project's conventions and preferences. If it contradicts the general guidance above, defer to the user-provided context for this specific project:
+{user_context_md}

package/src/agent/requirements-agent.txt ADDED Viewed

@@ -0,0 +1,3 @@
+strands-agents>=0.1.0
+strands-agents-tools>=0.1.0
+pyyaml>=6.0

package/src/lib/generated/cli-options.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-30T16:45:56.916Z
+// Generated: 2026-07-01T20:12:14.883Z
 /**
  * CLI option definitions derived from parameter-schema-v2.json.

package/src/lib/generated/parameter-matrix.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-30T16:45:57.021Z
+// Generated: 2026-07-01T20:12:14.996Z
 /**
  * Parameter matrix defining how each parameter is loaded from various sources.

package/src/lib/generated/validation-rules.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-30T16:45:56.949Z
+// Generated: 2026-07-01T20:12:14.922Z
 /**
  * Validation rules derived from parameter-schema-v2.json.