npm - @aws/ml-container-creator - Versions diffs - 1.0.3 → 1.1.0 - Mend

@aws/ml-container-creator 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

package/README.md +10 -1
package/bin/cli.js +57 -0
package/config/agent.json +16 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
package/package.json +5 -2
package/pyproject.toml +3 -0
package/servers/agent-knowledge/index.js +592 -0
package/servers/agent-knowledge/package.json +15 -0
package/servers/base-image-picker/index.js +65 -18
package/servers/instance-sizer/index.js +32 -0
package/servers/lib/catalogs/fleet-drivers.json +38 -0
package/servers/lib/catalogs/model-arch-support.json +51 -0
package/servers/lib/catalogs/model-servers.json +2842 -1730
package/servers/lib/schemas/image-catalog.schema.json +12 -0
package/src/agent/__init__.py +2 -0
package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
package/src/agent/agent.py +513 -0
package/src/agent/config_loader.py +215 -0
package/src/agent/context.py +380 -0
package/src/agent/data/capability-matrix.json +106 -0
package/src/agent/health_check.py +341 -0
package/src/agent/prompts/system.md +173 -0
package/src/agent/requirements-agent.txt +3 -0
package/src/app.js +6 -4
package/src/lib/generated/cli-options.js +1 -1
package/src/lib/generated/parameter-matrix.js +1 -1
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/mcp-query-runner.js +110 -3
package/src/lib/prompt-runner.js +66 -22
package/src/lib/template-variable-resolver.js +8 -0
package/src/lib/train-config-builder.js +339 -0
package/src/lib/tune-config-state.js +89 -68
package/templates/do/.benchmark_writer.py +3 -0
package/templates/do/.eval_helper.py +409 -0
package/templates/do/.register_helper.py +185 -11
package/templates/do/.train_build_request.py +102 -113
package/templates/do/.train_helper.py +433 -0
package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +157 -0
package/templates/do/benchmark +60 -3
package/templates/do/config +6 -1
package/templates/do/deploy.d/managed-inference.ejs +83 -0
package/templates/do/evaluate +272 -0
package/templates/do/lib/resolve-instance.sh +155 -0
package/templates/do/register +5 -0
package/templates/do/test +1 -0
package/templates/do/train +879 -126
package/templates/do/training/config.yaml +83 -11
package/templates/do/training/dpo/accelerate_config.yaml +24 -0
package/templates/do/training/dpo/defaults.yaml +26 -0
package/templates/do/training/dpo/prompts.json +8 -0
package/templates/do/training/dpo/train.py +363 -0
package/templates/do/training/sft/accelerate_config.yaml +22 -0
package/templates/do/training/sft/defaults.yaml +18 -0
package/templates/do/training/sft/prompts.json +7 -0
package/templates/do/training/sft/train.py +310 -0
package/templates/do/tune +11 -2
package/src/lib/auto-prompt-builder.js +0 -172
package/src/lib/cli-handler.js +0 -529
package/src/lib/community-reports-validator.js +0 -91
package/src/lib/configuration-exporter.js +0 -204
package/src/lib/dataset-slug.js +0 -152
package/src/lib/docker-introspection-validator.js +0 -51
package/src/lib/known-flags-validator.js +0 -200
package/src/lib/schema-validator.js +0 -157
package/src/lib/train-config-parser.js +0 -136
package/src/lib/train-config-persistence.js +0 -143
package/src/lib/train-config-validator.js +0 -112
package/src/lib/train-feedback.js +0 -46
package/src/lib/train-idempotency.js +0 -97
package/src/lib/train-request-builder.js +0 -120
package/src/lib/tune-dataset-validator.js +0 -279
package/src/lib/tune-output-resolver.js +0 -66
package/templates/do/.train_poll_parser.py +0 -135
package/templates/do/.train_status_parser.py +0 -187
/package/templates/do/training/{train.py → custom/train.py} +0 -0

package/src/lib/tune-config-state.js CHANGED Viewed

@@ -2,65 +2,91 @@
 // SPDX-License-Identifier: Apache-2.0
 /**
- * Tune Config State Manager
+ * Tune Config State
  *
- * JavaScript module that mimics the bash _update_config_var() behavior
- * from do/tune for testing purposes. Manages config variables written
- * after job submission.
+ * Manages bash-style config files (do/config) that contain lines like:
+ *   export VAR_NAME="value"
+ *
+ * Provides read/write access for tuning job state variables.
  */
 import { readFileSync, writeFileSync } from 'node:fs';
 /**
- * Update or add a config variable in a do/config-style file.
- * Mimics the bash _update_config_var() function:
- * - If the variable exists (line starts with `export VAR_NAME=`), replace it
- * - Otherwise, append a new line
+ * Read a variable value from a bash config file.
+ * Looks for lines matching: export VAR_NAME="value", export VAR_NAME='value', or export VAR_NAME=value
  *
  * @param {string} configPath - Path to the config file
- * @param {string} varName - Variable name (e.g., TUNE_JOB_NAME_SFT)
- * @param {string} varValue - Variable value
+ * @param {string} varName - Variable name to read
+ * @returns {string|null} The unquoted value, or null if not found
  */
-export function updateConfigVar(configPath, varName, varValue) {
-    let content = readFileSync(configPath, 'utf8');
-    const pattern = new RegExp(`^export ${varName}=.*$`, 'm');
+export function readConfigVar(configPath, varName) {
+    const content = readFileSync(configPath, 'utf8');
+    const lines = content.split('\n');
-    if (pattern.test(content)) {
-        content = content.replace(pattern, `export ${varName}="${varValue}"`);
-    } else {
-        if (content.length > 0 && !content.endsWith('\n')) {
-            content += '\n';
+    for (const line of lines) {
+        const trimmed = line.trim();
+        const prefix = `export ${varName}=`;
+        if (trimmed.startsWith(prefix)) {
+            let value = trimmed.slice(prefix.length);
+            // Strip surrounding quotes (double or single)
+            if ((value.startsWith('"') && value.endsWith('"')) ||
+                (value.startsWith('\'') && value.endsWith('\''))) {
+                value = value.slice(1, -1);
+            }
+            return value;
         }
-        content += `export ${varName}="${varValue}"\n`;
     }
-    writeFileSync(configPath, content, 'utf8');
+    return null;
 }
 /**
- * Read a config variable from a do/config-style file.
+ * Write or update a variable in a bash config file.
+ * If the variable already exists, replaces that line.
+ * If not, appends the new export line.
  *
  * @param {string} configPath - Path to the config file
- * @param {string} varName - Variable name to read
- * @returns {string|null} The variable value, or null if not found
+ * @param {string} varName - Variable name to set
+ * @param {string} value - Value to assign
  */
-export function readConfigVar(configPath, varName) {
+export function updateConfigVar(configPath, varName, value) {
     const content = readFileSync(configPath, 'utf8');
-    const pattern = new RegExp(`^export ${varName}="([^"]*)"`, 'm');
-    const match = content.match(pattern);
-    return match ? match[1] : null;
+    const lines = content.split('\n');
+    const prefix = `export ${varName}=`;
+    const newLine = `export ${varName}="${value}"`;
+    let found = false;
+    for (let i = 0; i < lines.length; i++) {
+        if (lines[i].trim().startsWith(prefix)) {
+            lines[i] = newLine;
+            found = true;
+            break;
+        }
+    }
+    if (found) {
+        writeFileSync(configPath, lines.join('\n'), 'utf8');
+    } else {
+        // Append to end of file
+        let appendContent = content;
+        if (appendContent.length > 0 && !appendContent.endsWith('\n')) {
+            appendContent += '\n';
+        }
+        appendContent += `${newLine  }\n`;
+        writeFileSync(configPath, appendContent, 'utf8');
+    }
 }
 /**
- * Simulate the config writes that happen after a successful job submission.
- * This mirrors the behavior in do/tune's _submit_job() function.
+ * Write tuning job submission state to config.
  *
  * @param {string} configPath - Path to the config file
- * @param {object} params - Submission parameters
- * @param {string} params.technique - Technique (sft, dpo, rlaif, rlvr)
- * @param {string} params.trainingType - Training type (lora, full-rank)
- * @param {string} params.datasetPath - Dataset path (s3://... or hf://...)
- * @param {string} params.jobName - Generated job name
+ * @param {object} state - Submission state
+ * @param {string} state.technique - Tuning technique (e.g., 'sft', 'dpo')
+ * @param {string} state.trainingType - Training type (e.g., 'lora', 'full-rank')
+ * @param {string} state.datasetPath - Dataset path (S3 or HF URI)
+ * @param {string} state.jobName - Generated job name
  */
 export function persistSubmissionState(configPath, { technique, trainingType, datasetPath, jobName }) {
     const techniqueUpper = technique.toUpperCase();
@@ -71,59 +97,54 @@ export function persistSubmissionState(configPath, { technique, trainingType, da
 }
 /**
- * Simulate the config writes that happen after a job completes successfully.
- * This mirrors the behavior in do/tune's _handle_completion() function.
- *
- * Writes three levels of tracking (AC-4.1, AC-4.2):
- * - Level 1: TUNE_OUTPUT_PATH_LATEST (always the last run, any technique)
- * - Level 2: TUNE_ADAPTER_PATH_<TECHNIQUE> (last run per technique)
- * - Level 3: TUNE_ADAPTER_PATH_<TECHNIQUE>_<SLUG> (per technique + dataset slug)
+ * Write tuning job completion state to config.
  *
  * @param {string} configPath - Path to the config file
- * @param {object} params - Completion parameters
- * @param {string} params.technique - Technique (sft, dpo, rlaif, rlvr)
- * @param {string} params.trainingType - Training type (lora, full-rank)
- * @param {string} params.artifactPath - S3 path to the output artifact
- * @param {string} params.outputType - Output type (adapter, full-model)
- * @param {string} [params.datasetSlug] - Optional dataset slug for per-technique-per-dataset tracking
+ * @param {object} state - Completion state
+ * @param {string} state.technique - Tuning technique
+ * @param {string} state.trainingType - Training type
+ * @param {string} state.artifactPath - Output artifact path (S3 URI)
+ * @param {string} state.outputType - Output type ('adapter' or 'model')
+ * @param {string} [state.datasetSlug] - Dataset slug for named paths
  */
-export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType, datasetSlug }) {
+export function persistCompletionState(configPath, { technique, trainingType: _trainingType, artifactPath, outputType, datasetSlug }) {
     const techniqueUpper = technique.toUpperCase();
-    if (trainingType === 'lora') {
-        // Level 2: per-technique
+    updateConfigVar(configPath, 'TUNE_OUTPUT_PATH_LATEST', artifactPath);
+    updateConfigVar(configPath, 'TUNE_OUTPUT_TYPE_LATEST', outputType);
+    if (outputType === 'adapter') {
         updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}`, artifactPath);
-        // Level 3: per-technique + per-dataset (if slug available)
         if (datasetSlug) {
             const slugUpper = datasetSlug.toUpperCase().replace(/-/g, '_');
             updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}_${slugUpper}`, artifactPath);
         }
-    } else if (trainingType === 'full-rank') {
+    } else {
         updateConfigVar(configPath, `TUNE_MODEL_PATH_${techniqueUpper}`, artifactPath);
     }
-    // Level 1: latest
-    updateConfigVar(configPath, 'TUNE_OUTPUT_PATH_LATEST', artifactPath);
-    updateConfigVar(configPath, 'TUNE_OUTPUT_TYPE_LATEST', outputType);
 }
 /**
- * Generate a job name following the pattern used by do/tune.
- * Pattern: ${projectName}-tune-${technique}-YYYYMMDD-HHMMSS
+ * Generate a job name matching pattern: ${projectName}-tune-${technique}-YYYYMMDD-HHMMSS
+ * Uses local time for the timestamp.
  *
  * @param {string} projectName - Project name
- * @param {string} technique - Technique (sft, dpo, rlaif, rlvr)
- * @param {Date} [timestamp] - Optional timestamp (defaults to now)
- * @returns {string} Generated job name
+ * @param {string} technique - Tuning technique
+ * @param {Date} [timestamp] - Optional timestamp (defaults to new Date())
+ * @returns {string} Formatted job name
  */
-export function generateJobName(projectName, technique, timestamp = new Date()) {
-    const year = timestamp.getFullYear().toString();
-    const month = (timestamp.getMonth() + 1).toString().padStart(2, '0');
-    const day = timestamp.getDate().toString().padStart(2, '0');
-    const hours = timestamp.getHours().toString().padStart(2, '0');
-    const minutes = timestamp.getMinutes().toString().padStart(2, '0');
-    const seconds = timestamp.getSeconds().toString().padStart(2, '0');
+export function generateJobName(projectName, technique, timestamp) {
+    const ts = timestamp || new Date();
+    const year = ts.getFullYear().toString();
+    const month = (ts.getMonth() + 1).toString().padStart(2, '0');
+    const day = ts.getDate().toString().padStart(2, '0');
+    const hours = ts.getHours().toString().padStart(2, '0');
+    const minutes = ts.getMinutes().toString().padStart(2, '0');
+    const seconds = ts.getSeconds().toString().padStart(2, '0');
     const dateStr = `${year}${month}${day}`;
     const timeStr = `${hours}${minutes}${seconds}`;
     return `${projectName}-tune-${technique}-${dateStr}-${timeStr}`;
 }

package/templates/do/.benchmark_writer.py CHANGED Viewed

@@ -1478,6 +1478,7 @@ def _load_config_file(config_path):
                     'HF_MODEL_ID': 'hf_model_id',
                     'INSTANCE_TYPE': 'instance_type',
                     'INSTANCE_POOLS': 'instance_pools',
+                    'DEPLOYED_INSTANCE_TYPE': 'deployed_instance_type',
                     'BENCHMARK_INSTANCE_TYPE': 'benchmark_instance_type',
                     'DEPLOYMENT_CONFIG': 'deployment_config',
                     'DEPLOYMENT_TARGET': 'deployment_target',
@@ -1521,6 +1522,8 @@ def _load_config_file(config_path):
     #   BENCHMARK_INSTANCE_TYPE (live-resolved, persisted by do/benchmark) > INSTANCE_TYPE > INSTANCE_POOLS fallback
     if context.get('benchmark_instance_type'):
         context['instance_type'] = context.pop('benchmark_instance_type')
+    elif context.get('deployed_instance_type'):
+        context['instance_type'] = context.pop('deployed_instance_type')
     # Fall back to INSTANCE_POOLS when neither is set.
     # Heterogeneous pool configs may not have a standalone INSTANCE_TYPE value
     # but always define INSTANCE_POOLS as a JSON array with Priority fields.

package/templates/do/.eval_helper.py ADDED Viewed

@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Model Quality Evaluation Helper.
+Subcommands:
+    evaluate   - Run evaluation against deployed endpoint, compute metrics
+    eval-write - Write evaluation results to S3/Athena (Parquet)
+All output is JSON on stdout for bash consumption.
+"""
+import argparse
+import json
+import math
+import os
+import sys
+import time
+# ── Utility functions ─────────────────────────────────────────────────────────
+def _error_exit(message):
+    """Print JSON error to stdout and exit."""
+    print(json.dumps({"error": True, "message": message}))
+    sys.exit(1)
+def _output(data):
+    """Print JSON result to stdout."""
+    print(json.dumps(data))
+    sys.exit(0)
+# ── Endpoint invocation ───────────────────────────────────────────────────────
+def _invoke_endpoint(endpoint_name, ic_name, region, payload):
+    """Invoke SageMaker endpoint via boto3 runtime.
+    Uses InvokeEndpoint with InferenceComponentName header for IC routing.
+    Payload should be an OpenAI-compatible chat completion request.
+    Returns: parsed JSON response dict
+    """
+    import boto3
+    client = boto3.client('sagemaker-runtime', region_name=region)
+    kwargs = {
+        'EndpointName': endpoint_name,
+        'ContentType': 'application/json',
+        'Body': json.dumps(payload),
+    }
+    if ic_name:
+        kwargs['InferenceComponentName'] = ic_name
+    try:
+        response = client.invoke_endpoint(**kwargs)
+        body = response['Body'].read().decode('utf-8')
+        return json.loads(body)
+    except Exception as e:
+        return {"error": str(e)}
+def _score_text(endpoint_name, ic_name, region, prompt, completion):
+    """Score a completion by getting its logprobs via the endpoint.
+    Sends prompt + completion and requests logprobs for the completion tokens.
+    Returns sum of token logprobs, or None if logprobs unavailable.
+    """
+    messages = [
+        {"role": "user", "content": prompt},
+        {"role": "assistant", "content": completion},
+    ]
+    payload = {
+        "messages": messages,
+        "max_tokens": 1,
+        "temperature": 0.0,
+        "logprobs": True,
+        "top_logprobs": 1,
+    }
+    response = _invoke_endpoint(endpoint_name, ic_name, region, payload)
+    if "error" in response:
+        return None
+    # Extract logprobs from response
+    try:
+        choices = response.get("choices", [])
+        if not choices:
+            return None
+        # For scoring, we need the logprobs of the completion tokens
+        # The response format varies — try OpenAI-compatible format
+        logprobs_data = choices[0].get("logprobs")
+        if logprobs_data and "content" in logprobs_data:
+            token_logprobs = [t.get("logprob", 0.0) for t in logprobs_data["content"]]
+            return sum(token_logprobs) if token_logprobs else None
+        return None
+    except (KeyError, TypeError, IndexError):
+        return None
+def _generate_response(endpoint_name, ic_name, region, prompt, max_tokens=256):
+    """Generate a response from the endpoint for generation-based metrics.
+    Returns: generated text string, or None on failure.
+    """
+    payload = {
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+    }
+    response = _invoke_endpoint(endpoint_name, ic_name, region, payload)
+    if "error" in response:
+        return None
+    try:
+        choices = response.get("choices", [])
+        if choices:
+            return choices[0].get("message", {}).get("content", "")
+        return None
+    except (KeyError, TypeError, IndexError):
+        return None
+# ── Metric computation ────────────────────────────────────────────────────────
+def _compute_sft_metrics(endpoint_name, ic_name, region, dataset, samples):
+    """Compute SFT evaluation metrics.
+    Metrics: perplexity (via logprobs), avg_response_length, format_compliance, exact_match
+    """
+    metrics = {}
+    logprob_scores = []
+    response_lengths = []
+    exact_matches = 0
+    total = 0
+    for i, record in enumerate(dataset):
+        if samples and i >= samples:
+            break
+        prompt = record.get("prompt", "")
+        reference = record.get("reference", "")
+        if not prompt:
+            continue
+        total += 1
+        # Score via logprobs (for perplexity)
+        if reference:
+            score = _score_text(endpoint_name, ic_name, region, prompt, reference)
+            if score is not None:
+                # Approximate per-token logprob
+                # score is sum of logprobs; we need per-token average
+                # Estimate token count from character length (rough: 4 chars/token)
+                est_tokens = max(1, len(reference) // 4)
+                logprob_scores.append(score / est_tokens)
+        # Generate response (for length and exact match)
+        generated = _generate_response(endpoint_name, ic_name, region, prompt)
+        if generated is not None:
+            response_lengths.append(len(generated.split()))
+            if reference and generated.strip() == reference.strip():
+                exact_matches += 1
+    # Compute aggregate metrics
+    if logprob_scores:
+        avg_logprob = sum(logprob_scores) / len(logprob_scores)
+        metrics["perplexity"] = round(math.exp(-avg_logprob), 4)
+    if response_lengths:
+        metrics["avg_response_length"] = round(sum(response_lengths) / len(response_lengths), 1)
+    if total > 0:
+        metrics["exact_match_accuracy"] = round(exact_matches / total, 4)
+    metrics["samples_scored"] = total
+    return metrics
+def _compute_dpo_metrics(endpoint_name, ic_name, region, dataset, samples):
+    """Compute DPO evaluation metrics.
+    Metrics: reward_accuracy, avg_chosen_logprob, avg_rejected_logprob, reward_margin
+    """
+    metrics = {}
+    chosen_scores = []
+    rejected_scores = []
+    reward_correct = 0
+    total = 0
+    for i, record in enumerate(dataset):
+        if samples and i >= samples:
+            break
+        prompt = record.get("prompt", "")
+        chosen = record.get("chosen", "")
+        rejected = record.get("rejected", "")
+        if not prompt or not chosen or not rejected:
+            continue
+        total += 1
+        # Score chosen
+        chosen_score = _score_text(endpoint_name, ic_name, region, prompt, chosen)
+        # Score rejected
+        rejected_score = _score_text(endpoint_name, ic_name, region, prompt, rejected)
+        if chosen_score is not None and rejected_score is not None:
+            chosen_scores.append(chosen_score)
+            rejected_scores.append(rejected_score)
+            if chosen_score > rejected_score:
+                reward_correct += 1
+    # Compute aggregate metrics
+    scored = len(chosen_scores)
+    if scored > 0:
+        metrics["reward_accuracy"] = round(reward_correct / scored, 4)
+        metrics["avg_chosen_logprob"] = round(sum(chosen_scores) / scored, 4)
+        metrics["avg_rejected_logprob"] = round(sum(rejected_scores) / scored, 4)
+        metrics["reward_margin"] = round(
+            (sum(chosen_scores) - sum(rejected_scores)) / scored, 4
+        )
+    metrics["pairs_scored"] = scored
+    metrics["samples_evaluated"] = total
+    return metrics
+# ── Dataset loading ───────────────────────────────────────────────────────────
+def _load_eval_dataset(eval_dataset_path):
+    """Load evaluation dataset from local JSONL file or S3.
+    For this MVP, expects a local JSONL file path.
+    S3 and HF resolution is handled by the bash wrapper.
+    Returns: list of dicts
+    """
+    records = []
+    if not eval_dataset_path:
+        _error_exit("No evaluation dataset specified. Use --eval-dataset <path>")
+    # Handle S3 paths by downloading
+    if eval_dataset_path.startswith("s3://"):
+        import boto3
+        import tempfile
+        s3 = boto3.client('s3')
+        bucket = eval_dataset_path.split('/')[2]
+        key = '/'.join(eval_dataset_path.split('/')[3:])
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jsonl')
+        s3.download_file(bucket, key, tmp.name)
+        eval_dataset_path = tmp.name
+    # Load JSONL
+    try:
+        with open(eval_dataset_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    records.append(json.loads(line))
+    except (IOError, json.JSONDecodeError) as e:
+        _error_exit(f"Failed to load eval dataset: {e}")
+    if not records:
+        _error_exit("Evaluation dataset is empty")
+    return records
+# ── cmd_evaluate ──────────────────────────────────────────────────────────────
+def cmd_evaluate(args):
+    """Run evaluation against deployed endpoint.
+    Returns JSON with metrics and metadata.
+    """
+    endpoint_name = args.endpoint_name
+    ic_name = args.ic_name
+    region = args.region or os.environ.get('AWS_DEFAULT_REGION', 'us-east-1')
+    technique = args.technique or ''
+    samples = int(args.samples) if args.samples else None
+    # Load eval dataset
+    dataset = _load_eval_dataset(args.eval_dataset)
+    # Determine technique and compute metrics
+    if technique.lower() == 'dpo':
+        metrics = _compute_dpo_metrics(endpoint_name, ic_name, region, dataset, samples)
+    else:
+        # Default to SFT metrics (works for any technique)
+        metrics = _compute_sft_metrics(endpoint_name, ic_name, region, dataset, samples)
+    # Build result
+    result = {
+        "adapter_name": args.ic_name,
+        "technique": technique or "sft",
+        "model": os.environ.get("MODEL_NAME", ""),
+        "eval_dataset": args.eval_dataset or "",
+        "samples_evaluated": metrics.get("samples_evaluated", metrics.get("samples_scored", 0)),
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "metrics": metrics,
+    }
+    _output(result)
+# ── cmd_eval_write ────────────────────────────────────────────────────────────
+def cmd_eval_write(args):
+    """Write evaluation results to S3 as Parquet for Athena.
+    Reads a results JSON file and converts to Parquet format.
+    """
+    results_file = args.results_file
+    bucket = args.bucket
+    region = args.region or os.environ.get('AWS_DEFAULT_REGION', 'us-east-1')
+    # Read results
+    try:
+        with open(results_file, 'r') as f:
+            data = json.load(f)
+    except (IOError, json.JSONDecodeError) as e:
+        _error_exit(f"Failed to read results file: {e}")
+    adapter_name = data.get("adapter_name", "unknown")
+    technique = data.get("technique", "unknown")
+    model = data.get("model", "unknown")
+    timestamp = data.get("timestamp", time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
+    # Build Parquet record
+    record = {
+        "project_name": os.environ.get("PROJECT_NAME", ""),
+        "model_name": model,
+        "adapter_name": adapter_name,
+        "technique": technique,
+        "eval_dataset": data.get("eval_dataset", ""),
+        "samples_evaluated": data.get("samples_evaluated", 0),
+        "metrics": json.dumps(data.get("metrics", {})),
+        "timestamp": timestamp,
+        "region": region,
+    }
+    # Write as JSON lines (Athena can read JSON as well as Parquet)
+    # For MVP, write as JSON lines to S3. Parquet requires pyarrow dep.
+    s3_key = f"evaluations/model={model}/adapter={adapter_name}/{timestamp.replace(':', '-')}.json"
+    s3_uri = f"s3://{bucket}/{s3_key}"
+    try:
+        import boto3
+        s3 = boto3.client('s3', region_name=region)
+        s3.put_object(
+            Bucket=bucket,
+            Key=s3_key,
+            Body=json.dumps(record),
+            ContentType='application/json',
+        )
+        _output({"written": True, "s3_uri": s3_uri})
+    except Exception as e:
+        _error_exit(f"Failed to write to S3: {e}")
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    parser = argparse.ArgumentParser(description='Model Quality Evaluation Helper')
+    subparsers = parser.add_subparsers(dest='command', required=True)
+    # evaluate
+    eval_parser = subparsers.add_parser('evaluate', help='Run evaluation')
+    eval_parser.add_argument('--endpoint-name', required=True)
+    eval_parser.add_argument('--ic-name', required=True)
+    eval_parser.add_argument('--region')
+    eval_parser.add_argument('--technique', default='')
+    eval_parser.add_argument('--eval-dataset', default='')
+    eval_parser.add_argument('--samples', default='')
+    eval_parser.add_argument('--metrics', default='')
+    # eval-write
+    write_parser = subparsers.add_parser('eval-write', help='Write results to S3')
+    write_parser.add_argument('--results-file', required=True)
+    write_parser.add_argument('--bucket', required=True)
+    write_parser.add_argument('--region')
+    args = parser.parse_args()
+    if args.command == 'evaluate':
+        cmd_evaluate(args)
+    elif args.command == 'eval-write':
+        cmd_eval_write(args)
+    else:
+        _error_exit(f"Unknown command: {args.command}")
+if __name__ == '__main__':
+    main()