npm - @aws/ml-container-creator - Versions diffs - 0.15.0 → 1.0.0 - Mend

@aws/ml-container-creator 0.15.0 → 1.0.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (14)

package/config/parameter-schema-v2.json +1 -1
package/package.json +1 -1
package/servers/endpoint-picker/index.js +23 -14
package/src/lib/generated/cli-options.js +1 -1
package/src/lib/generated/parameter-matrix.js +2 -2
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/mcp-client.js +15 -1
package/src/lib/mcp-query-runner.js +5 -1
package/src/lib/prompt-runner.js +35 -20
package/templates/do/.benchmark_writer.py +34 -0
package/templates/do/.register_helper.py +63 -41
package/templates/do/benchmark +14 -9
package/templates/do/register +8 -3
package/templates/do/tune +19 -1

package/config/parameter-schema-v2.json CHANGED Viewed

@@ -893,7 +893,7 @@
                 ]
             },
             "widget": null,
-            "prompt": null,
+            "prompt": "external",
             "deprecated": false,
             "since": "0.3.0"
         },

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aws/ml-container-creator",
-  "version": "0.15.0",
+  "version": "1.0.0",
   "description": "Build and deploy custom ML containers on AWS SageMaker with minimal configuration.",
   "main": "src/index.js",
   "bin": {

package/servers/endpoint-picker/index.js CHANGED Viewed

@@ -78,6 +78,7 @@ function getGpusForInstance(instanceType) {
 let _SageMakerClient = null;
 let _ListEndpointsCommand = null;
 let _DescribeEndpointCommand = null;
+let _DescribeEndpointConfigCommand = null;
 let _ListInferenceComponentsCommand = null;
 let _fromIni = null;
@@ -90,6 +91,7 @@ async function _ensureSdkLoaded() {
     _SageMakerClient = sdk.SageMakerClient;
     _ListEndpointsCommand = sdk.ListEndpointsCommand;
     _DescribeEndpointCommand = sdk.DescribeEndpointCommand;
+    _DescribeEndpointConfigCommand = sdk.DescribeEndpointConfigCommand;
     _ListInferenceComponentsCommand = sdk.ListInferenceComponentsCommand;
     try {
         const credentialProviders = await import('@aws-sdk/credential-providers');
@@ -197,9 +199,24 @@ async function fetchEndpoints(client, { limit = 10, showFull = false } = {}) {
             const primaryVariant = variants[0] || {};
             const variantName = primaryVariant.VariantName || 'AllTraffic';
-            const instanceType = primaryVariant.CurrentInstanceCount !== null && primaryVariant.CurrentInstanceCount !== undefined
-                ? (primaryVariant.InstanceType || detail.ProductionVariants?.[0]?.InstanceType || 'unknown')
-                : (primaryVariant.InstanceType || 'unknown');
+            let instanceType = primaryVariant.InstanceType || null;
+            // For IC-based endpoints, InstanceType may not be in the variant runtime response.
+            // Fall back to DescribeEndpointConfig which always has it.
+            if (!instanceType && detail.EndpointConfigName) {
+                try {
+                    const ecCmd = new _DescribeEndpointConfigCommand({ EndpointConfigName: detail.EndpointConfigName });
+                    const ecDetail = await client.send(ecCmd);
+                    const ecVariant = (ecDetail.ProductionVariants || [])[0];
+                    if (ecVariant?.InstanceType) {
+                        instanceType = ecVariant.InstanceType;
+                    }
+                } catch (ecErr) {
+                    log(`Warning: could not describe endpoint config for "${endpointName}": ${ecErr.message}`);
+                }
+            }
+            instanceType = instanceType || 'unknown';
             const instanceCount = primaryVariant.CurrentInstanceCount ?? primaryVariant.DesiredInstanceCount ?? 1;
             const hasInstancePools = !!(primaryVariant.InstancePools && primaryVariant.InstancePools.length > 0);
@@ -387,17 +404,9 @@ server.tool(
         limit: z.number().int().positive().default(10).describe('Maximum number of endpoints to return'),
         context: z.record(z.string(), z.any()).optional().describe('Current configuration context (awsRegion, awsProfile, deploymentTarget)')
     },
-    async ({ parameters, limit, context }) => {
-        // Only respond if parameters includes endpointName AND context.deploymentTarget is realtime-inference
-        if (!parameters.includes('endpointName')) {
-            return {
-                content: [{
-                    type: 'text',
-                    text: JSON.stringify({ values: {}, choices: {} })
-                }]
-            };
-        }
+    async ({ parameters: _parameters, limit, context }) => {
+        // Only respond if context.deploymentTarget is realtime-inference
+        // Note: parameters may be empty when called on-demand via queryMcpServer()
         if (context?.deploymentTarget && context.deploymentTarget !== 'realtime-inference') {
             return {
                 content: [{

package/src/lib/generated/cli-options.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-22T13:49:00.815Z
+// Generated: 2026-06-23T20:55:23.381Z
 /**
  * CLI option definitions derived from parameter-schema-v2.json.

package/src/lib/generated/parameter-matrix.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-22T13:49:00.924Z
+// Generated: 2026-06-23T20:55:23.482Z
 /**
  * Parameter matrix defining how each parameter is loaded from various sources.
@@ -225,7 +225,7 @@ export const parameterMatrix = {
         'configFile': true,
         'packageJson': false,
         'mcp': true,
-        'promptable': false,
+        'promptable': true,
         'required': false,
         'default': null,
         'valueSpace': 'unbounded'

package/src/lib/generated/validation-rules.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-22T13:49:00.849Z
+// Generated: 2026-06-23T20:55:23.412Z
 /**
  * Validation rules derived from parameter-schema-v2.json.

package/src/lib/mcp-client.js CHANGED Viewed

@@ -143,9 +143,23 @@ class McpClient {
         // Build context from bounded parameters that have defaults
         const context = this._buildContext();
+        // Auto-discover tool name if using the default (get_ml_config)
+        // Each server registers its own tool name (e.g. get_base_images, get_inference_endpoints)
+        let toolName = this.toolName;
+        if (toolName === DEFAULT_TOOL_NAME) {
+            try {
+                const toolList = await this._client.listTools();
+                if (toolList && toolList.tools && toolList.tools.length > 0) {
+                    toolName = toolList.tools[0].name;
+                }
+            } catch (_listErr) {
+                // Fall through to use default tool name
+            }
+        }
         // Call the configured tool
         const result = await this._client.callTool({
-            name: this.toolName,
+            name: toolName,
             arguments: {
                 parameters: unboundedParams,
                 limit: this.limit,

package/src/lib/mcp-query-runner.js CHANGED Viewed

@@ -371,9 +371,13 @@ export default class McpQueryRunner {
         console.log('   🔍 Querying endpoint-picker...');
         try {
+            // Pass awsProfile from bootstrap config for credential resolution
+            const awsProfile = this.runner.configManager?.config?.awsProfile
+                || this.runner.options?.profile || process.env.AWS_PROFILE || null;
             const result = await cm.queryMcpServer('endpoint-picker', {
                 awsRegion: infraAnswers.awsRegion,
-                deploymentTarget: 'realtime-inference'
+                deploymentTarget: 'realtime-inference',
+                ...(awsProfile ? { awsProfile } : {})
             });
             if (result && result.choices?.endpointName?.length > 0) {

package/src/lib/prompt-runner.js CHANGED Viewed

@@ -224,25 +224,39 @@ export default class PromptRunner {
         // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
         let existingEndpointAnswers = {};
         if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
-            // Query endpoint-picker MCP server for available endpoints
-            const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
-            await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
-            const endpointPreviousAnswers = {
-                ...regionAndTargetAnswers,
-                ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
-            };
-            existingEndpointAnswers = await this._runPhase(
-                infraExistingEndpointPrompts,
-                endpointPreviousAnswers,
+            // First ask if user wants to attach to existing endpoint (no MCP call yet)
+            const attachAnswer = await this._runPhase(
+                [infraExistingEndpointPrompts[0]],
+                { ...regionAndTargetAnswers },
                 explicitConfig,
                 existingConfig
             );
-            // Resolve custom endpoint name
-            if (existingEndpointAnswers.customExistingEndpointName) {
-                existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
-                delete existingEndpointAnswers.customExistingEndpointName;
+            if (attachAnswer.useExistingEndpoint === 'yes') {
+                // Only now query endpoint-picker MCP server
+                const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
+                await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
+                const endpointPreviousAnswers = {
+                    ...regionAndTargetAnswers,
+                    ...attachAnswer,
+                    ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
+                };
+                existingEndpointAnswers = await this._runPhase(
+                    infraExistingEndpointPrompts.slice(1),
+                    endpointPreviousAnswers,
+                    explicitConfig,
+                    existingConfig
+                );
+                existingEndpointAnswers.useExistingEndpoint = 'yes';
+                // Resolve custom endpoint name
+                if (existingEndpointAnswers.customExistingEndpointName) {
+                    existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
+                    delete existingEndpointAnswers.customExistingEndpointName;
+                }
+            } else {
+                existingEndpointAnswers = attachAnswer;
             }
         }
@@ -376,11 +390,12 @@ export default class PromptRunner {
             const sizerRecs = this._instanceSizerMetadata.recommendations || [];
             const finalInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
             const matchingRec = sizerRecs.find(r => r.instanceType === finalInstanceType);
-            const tpRec = matchingRec || sizerRecs[0];
-            if (tpRec && tpRec.tensorParallelism > 1) {
-                this._autoTensorParallelism = tpRec.tensorParallelism;
-                this._autoGpuCount = tpRec.gpuCount;
-                console.log(`   ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
+            // Only use sizer TP recommendation if user selected a recommended instance
+            // Custom instances resolve TP from the instance catalog in template-variable-resolver
+            if (matchingRec && matchingRec.tensorParallelism > 1) {
+                this._autoTensorParallelism = matchingRec.tensorParallelism;
+                this._autoGpuCount = matchingRec.gpuCount;
+                console.log(`   ✓ Auto-set tensor parallelism: TP=${matchingRec.tensorParallelism} (${matchingRec.gpuCount} GPUs)`);
             }
             // Display capacity type confirmation for selected instance

package/templates/do/.benchmark_writer.py CHANGED Viewed

@@ -1401,8 +1401,18 @@ def _load_config_file(config_path):
                     'BASE_IMAGE_VERSION': 'base_image_version',
                     'BENCHMARK_CONCURRENCY': 'benchmark_concurrency',
                 }
+                # Also capture IC_ENV_* serving config vars
+                ic_env_map = {
+                    'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
+                    'IC_ENV_VLLM_QUANTIZATION': 'quantization',
+                    'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
+                    'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
+                    'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
+                }
                 if key in shell_map:
                     context[shell_map[key]] = value
+                elif key in ic_env_map:
+                    context[ic_env_map[key]] = value
     except Exception:
         pass
@@ -1419,6 +1429,30 @@ def _load_config_file(config_path):
         parts = context['model_name'].rstrip('/').split('/')
         context['model_name'] = parts[-1] if parts else context['model_name']
+    # Also scan IC config files (do/ic/*.conf) for IC_ENV_* serving params
+    # These override do/config values for serving-specific settings
+    try:
+        import glob
+        config_dir = os.path.dirname(os.path.abspath(config_path))
+        ic_dir = os.path.join(config_dir, 'ic')
+        ic_env_map = {
+            'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
+            'IC_ENV_VLLM_QUANTIZATION': 'quantization',
+            'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
+            'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
+            'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
+        }
+        for conf_file in sorted(glob.glob(os.path.join(ic_dir, '*.conf'))):
+            with open(conf_file, 'r') as f:
+                for line in f:
+                    match = re.match(r'^export\s+([A-Z_][A-Z0-9_]*)=["\']?([^"\']*)["\']?\s*$', line.strip())
+                    if match:
+                        key, value = match.group(1), match.group(2)
+                        if key in ic_env_map and value:
+                            context[ic_env_map[key]] = value
+    except Exception:
+        pass  # IC config scanning is best-effort
     return context

package/templates/do/.register_helper.py CHANGED Viewed

@@ -89,6 +89,8 @@ def _truncate_metadata(props):
     result = {}
     for key, value in props.items():
         str_val = str(value) if value is not None else ""
+        if not str_val:
+            continue  # SageMaker requires min length 1 for metadata values — skip empty
         if len(str_val) > MAX_METADATA_VALUE_LEN:
             _warn(f"Metadata '{key}' truncated ({len(str_val)} → {MAX_METADATA_VALUE_LEN} chars)")
             str_val = str_val[: MAX_METADATA_VALUE_LEN - 1] + "…"
@@ -264,33 +266,41 @@ def cmd_register_model(args):
     container_image = args.container_image or ""
     model_data_url = args.model_data_url or ""
-    inference_spec = {
-        "Containers": [
-            {
-                "Image": container_image,
-            }
-        ],
-        "SupportedContentTypes": ["application/json"],
-        "SupportedResponseMIMETypes": ["application/json"],
-    }
-    # Only include ModelDataUrl if provided
-    if model_data_url:
-        inference_spec["Containers"][0]["ModelDataUrl"] = model_data_url
     # Step 4: Create Model Package version (AC-1.2, AC-1.7)
     description = f"{args.deployment_config or 'model'} on {args.instance_type or 'unknown'}"
     print(f"Registering model version in {project_name}...", file=sys.stderr)
     try:
-        pkg = ModelPackage.create(
-            model_package_group_name=project_name,
-            model_package_description=description,
-            inference_specification=inference_spec,
-            customer_metadata_properties=metadata,
-            model_approval_status="Approved",
-        )
+        # Use boto3 directly — sagemaker-core v2.14 has a KeyError bug in ModelPackage.create()
+        # where it tries to read response["ModelPackageName"] but the API returns "ModelPackageArn".
+        import boto3
+        sm_client = boto3.client("sagemaker", region_name=region)
+        create_params = {
+            "ModelPackageGroupName": project_name,
+            "ModelPackageDescription": description,
+            "ModelApprovalStatus": "Approved",
+        }
+        if container_image:
+            create_params["InferenceSpecification"] = {
+                "Containers": [{"Image": container_image}],
+                "SupportedContentTypes": ["application/json"],
+                "SupportedResponseMIMETypes": ["application/json"],
+            }
+            if model_data_url:
+                create_params["InferenceSpecification"]["Containers"][0]["ModelDataUrl"] = model_data_url
+        if model_data_url:
+            if "InferenceSpecification" not in create_params:
+                # Store model data URL in metadata if no container image
+                if not metadata:
+                    metadata = {}
+                metadata["modelDataUrl"] = model_data_url[:1024]
+        if metadata:
+            create_params["CustomerMetadataProperties"] = metadata
+        response = sm_client.create_model_package(**create_params)
+        model_package_arn = response["ModelPackageArn"]
-        model_package_arn = pkg.model_package_arn
         # Extract version number from ARN (format: .../project-name/version)
         version = _extract_version_from_arn(model_package_arn)
@@ -407,33 +417,39 @@ def cmd_register_adapter(args):
     container_image = args.container_image or ""
     model_data_url = args.model_data_url or ""
-    inference_spec = {
-        "Containers": [
-            {
-                "Image": container_image,
-            }
-        ],
-        "SupportedContentTypes": ["application/json"],
-        "SupportedResponseMIMETypes": ["application/json"],
-    }
-    if model_data_url:
-        inference_spec["Containers"][0]["ModelDataUrl"] = model_data_url
     # Step 4: Create adapter Model Package version (AC-2.1)
     technique = args.tune_technique or "unknown"
     description = f"adapter ({technique}) on {args.instance_type or 'unknown'}, parent: {parent_version_arn}"
     print(f"Registering adapter version in {project_name}...", file=sys.stderr)
     try:
-        pkg = ModelPackage.create(
-            model_package_group_name=project_name,
-            model_package_description=description,
-            inference_specification=inference_spec,
-            customer_metadata_properties=metadata,
-            model_approval_status="Approved",
-        )
+        # Use boto3 directly — sagemaker-core v2.14 has a KeyError bug in ModelPackage.create()
+        import boto3
+        sm_client = boto3.client("sagemaker", region_name=region)
+        create_params = {
+            "ModelPackageGroupName": project_name,
+            "ModelPackageDescription": description,
+            "ModelApprovalStatus": "Approved",
+        }
+        if container_image:
+            create_params["InferenceSpecification"] = {
+                "Containers": [{"Image": container_image}],
+                "SupportedContentTypes": ["application/json"],
+                "SupportedResponseMIMETypes": ["application/json"],
+            }
+            if model_data_url:
+                create_params["InferenceSpecification"]["Containers"][0]["ModelDataUrl"] = model_data_url
+        elif model_data_url:
+            if not metadata:
+                metadata = {}
+            metadata["modelDataUrl"] = model_data_url[:1024]
+        if metadata:
+            create_params["CustomerMetadataProperties"] = metadata
+        response = sm_client.create_model_package(**create_params)
+        model_package_arn = response["ModelPackageArn"]
-        model_package_arn = pkg.model_package_arn
         version = _extract_version_from_arn(model_package_arn)
         print(f"Registered adapter version {version}: {model_package_arn}", file=sys.stderr)
@@ -1133,6 +1149,12 @@ def main():
         parser.print_help()
         sys.exit(1)
+    # Set region before any sagemaker-core import (creates boto3 clients at import time)
+    region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
+    if region:
+        os.environ['AWS_DEFAULT_REGION'] = region
+        os.environ.setdefault('AWS_REGION', region)
     if args.command == "create-mpg":
         cmd_create_mpg(args)
     elif args.command == "register-model":

package/templates/do/benchmark CHANGED Viewed

@@ -117,10 +117,12 @@ if [ "${ARG_STATUS}" = true ]; then
                     tar_file=""
                     tar_file=$(find "${LOCAL_RESULTS_DIR}" -name "output.tar.gz" -type f 2>/dev/null | head -1)
                     if [ -n "${tar_file}" ]; then
-                        # Detect whether tar has a leading directory prefix
-                        _tar_first=""
-                        _tar_first=$(tar -tzf "${tar_file}" 2>/dev/null | head -1)
-                        if echo "${_tar_first}" | grep -qE '^[^/]+/$'; then
+                        # Detect whether ALL entries share a common leading directory prefix
+                        _tar_prefix_count=""
+                        _tar_prefix_count=$(tar -tzf "${tar_file}" 2>/dev/null | sed 's|/.*||' | sort -u | wc -l | tr -d ' ')
+                        _tar_first_dir=""
+                        _tar_first_dir=$(tar -tzf "${tar_file}" 2>/dev/null | head -1)
+                        if [ "${_tar_prefix_count}" = "1" ] && echo "${_tar_first_dir}" | grep -qE '^[^/]+/$'; then
                             tar -xzf "${tar_file}" --strip-components=1 -C "${LOCAL_RESULTS_DIR}/output/" 2>/dev/null || true
                         else
                             tar -xzf "${tar_file}" -C "${LOCAL_RESULTS_DIR}/output/" 2>/dev/null || true
@@ -1097,11 +1099,14 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
             # Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
             for ARCHIVE in $(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f 2>/dev/null); do
                 ARCHIVE_DIR=$(dirname "${ARCHIVE}")
-                # Detect whether tar has a leading directory prefix to strip.
-                # Some AIPerf versions wrap in output/, others are flat.
-                _TAR_FIRST=$(tar -tzf "${ARCHIVE}" 2>/dev/null | head -1)
-                if echo "${_TAR_FIRST}" | grep -qE '^[^/]+/$'; then
-                    # Leading directory (e.g., "output/") — strip it
+                # Detect whether ALL entries share a common leading directory prefix.
+                # Only strip if every entry starts with the same dir (e.g., "output/file1", "output/file2").
+                # A flat archive with mixed top-level files/dirs (e.g., "plots/", "profile_export.jsonl")
+                # must NOT be stripped.
+                _TAR_PREFIX=$(tar -tzf "${ARCHIVE}" 2>/dev/null | sed 's|/.*||' | sort -u | wc -l | tr -d ' ')
+                _TAR_FIRST_DIR=$(tar -tzf "${ARCHIVE}" 2>/dev/null | head -1)
+                if [ "${_TAR_PREFIX}" = "1" ] && echo "${_TAR_FIRST_DIR}" | grep -qE '^[^/]+/$'; then
+                    # Single common leading directory (e.g., all under "output/") — strip it
                     tar -xzf "${ARCHIVE}" --strip-components=1 -C "${ARCHIVE_DIR}" 2>/dev/null || true
                 else
                     # Flat archive — extract as-is

package/templates/do/register CHANGED Viewed

@@ -1137,9 +1137,14 @@ ml-container-creator "${CMD_ARGS[@]}"
 # ============================================================
 # Container image URI for the deployed model
-CONTAINER_IMAGE_URI="${ECR_REPOSITORY_NAME}:${PROJECT_NAME}-latest"
-if [ -n "${BASE_IMAGE:-}" ]; then
-    CONTAINER_IMAGE_URI="${BASE_IMAGE}"
+# Build full ECR URI from profile (accountId + region + repoName + tag)
+_ACCOUNT_ID="${_PROFILE_accountId:-}"
+_REGION="${AWS_DEFAULT_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
+if [ -n "${_ACCOUNT_ID}" ] && [ -n "${ECR_REPOSITORY_NAME}" ]; then
+    CONTAINER_IMAGE_URI="${_ACCOUNT_ID}.dkr.ecr.${_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}:${PROJECT_NAME}-latest"
+else
+    # No ECR info available — MPG will be registered without InferenceSpecification
+    CONTAINER_IMAGE_URI=""
 fi
 # Model data S3 URI (from do/config if set)

package/templates/do/tune CHANGED Viewed

@@ -829,7 +829,25 @@ _validate_dataset() {
     if [ -z "${dataset}" ]; then
         if [ -n "${ARG_DATASET_NAME}" ]; then
             # Name-based resolution happens below via resolve-dataset
-            :
+            echo "🔍 Resolving dataset '${ARG_DATASET_NAME}' from registry..."
+            local resolve_result
+            resolve_result=$(python3 "${SCRIPT_DIR}/.register_helper.py" resolve-dataset \
+                --name "${ARG_DATASET_NAME}" 2>/dev/null) || resolve_result=""
+            if [ -n "${resolve_result}" ]; then
+                local resolved_uri
+                resolved_uri=$(echo "${resolve_result}" | grep -E '^\{' | tail -1 | python3 -c "import sys,json; print(json.load(sys.stdin).get('s3_uri',''))" 2>/dev/null) || resolved_uri=""
+                if [ -n "${resolved_uri}" ]; then
+                    echo "   Resolved to: ${resolved_uri}"
+                    dataset="${resolved_uri}"
+                    RESOLVED_DATASET_S3_URI="${resolved_uri}"
+                    return 0
+                fi
+            fi
+            echo "❌ Dataset '${ARG_DATASET_NAME}' not found in registry"
+            echo "   Run ./do/tune --list-datasets to see available datasets."
+            echo "   Register: ./do/register dataset <name> --s3-uri <uri> --technique <sft|dpo>"
+            exit 1
         else
             echo "❌ --dataset is required"
             echo "   Provide an S3 URI (s3://bucket/path.jsonl), HF reference (hf://org/name), or registered name"