npm - @aws/ml-container-creator - Versions diffs - 1.0.0 → 1.0.3 - Mend

@aws/ml-container-creator 1.0.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/bin/cli.js +1 -1
package/config/tune-catalog.json +303 -1
package/package.json +2 -1
package/servers/endpoint-picker/index.js +24 -4
package/servers/lib/catalogs/model-servers.json +334 -120
package/src/lib/bootstrap-command-handler.js +20 -2
package/src/lib/bootstrap-profile-manager.js +33 -0
package/src/lib/bootstrap-provisioners.js +48 -0
package/src/lib/cross-cutting-checker.js +6 -1
package/src/lib/generated/cli-options.js +1 -1
package/src/lib/generated/parameter-matrix.js +1 -1
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/path-prover-brain.js +57 -0
package/src/lib/prove-pipeline-executor.js +35 -0
package/templates/do/.benchmark_writer.py +114 -4
package/templates/do/.register_helper.py +643 -67
package/templates/do/.stage_helper.py +1 -0
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +267 -171
package/templates/do/benchmark +60 -5
package/templates/do/config +1 -1
package/templates/do/lib/inference-component.sh +6 -25
package/templates/do/register +29 -2
package/templates/do/tune +94 -12

package/src/lib/bootstrap-command-handler.js CHANGED Viewed

@@ -52,6 +52,7 @@ export default class BootstrapCommandHandler {
     _setupS3Buckets() { return this.provisioners._setupS3Buckets(); }
     _createS3Bucket(name, tags) { return this.provisioners._createS3Bucket(name, tags); }
     _verifyCliV2() { return this.provisioners._verifyCliV2(); }
+    _provisionAiRegistryHub(profileData) { return this.provisioners.provisionAiRegistryHub(profileData); }
     // ── ProfileManager delegations (backward compat for tests) ──────
@@ -63,6 +64,7 @@ export default class BootstrapCommandHandler {
     _handlePrune() { return this.profileManager._handlePrune(); }
     _handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
     _handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
+    _handleSyncServingVersions() { return this.profileManager._handleSyncServingVersions(); }
     /**
      * Dispatch bootstrap subcommands.
@@ -131,6 +133,9 @@ export default class BootstrapCommandHandler {
         case 'sync-model-families':
             await this._handleSyncModelFamilies();
             break;
+        case 'sync-serving-versions':
+            await this._handleSyncServingVersions();
+            break;
         // Migration path: upgrades legacy profiles to current naming conventions.
         // Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
         // to sharedInfraFrom. Idempotent — safe to run multiple times.
@@ -357,6 +362,9 @@ export default class BootstrapCommandHandler {
             console.log('     Tune jobs will still work but experiment tracking may not be available.');
         }
+        // Step 4c: AI Registry Hub
+        await this._provisionAiRegistryHub(profileData);
         // Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
         this._displayProgress('🧪', 'CI Testing Infrastructure...');
         try {
@@ -714,6 +722,10 @@ export default class BootstrapCommandHandler {
             console.log(`  ⚠️  MLflow App setup skipped: ${error.message}`);
         }
+        // Ensure AI Registry hub exists
+        this._currentProfile = profileConfig.awsProfile;
+        await this._provisionAiRegistryHub(profileConfig);
         // Save updated profile
         this.config.setProfile(name, profileConfig);
         console.log(`\n✅ Update complete for profile "${name}"`);
@@ -1459,7 +1471,9 @@ SUBCOMMANDS:
   prune                               Remove deleted and unknown records from the deployment manifest
   update                              Re-deploy bootstrap stacks using active profile (no prompts)
   migrate                             Upgrade legacy profiles to current naming conventions
+  sync-schemas                        Download AWS service model schemas (sagemaker, iam, ecr, s3)
   sync-model-families                 Discover tune-eligible models from JumpStart Hub and update catalog
+  sync-serving-versions               Discover latest vLLM/SGLang/TRT-LLM image versions and update catalog
 SETUP OPTIONS:
   --non-interactive                   Run without interactive prompts
@@ -1469,8 +1483,10 @@ SETUP OPTIONS:
   --role-arn <arn>                    Use existing IAM role ARN (skip role creation)
   --skip-s3                           Skip S3 bucket creation
   --ci                                Provision CI testing infrastructure
+  --benchmark-infra                   Provision Athena/Glue benchmark infrastructure (requires --ci)
   --skip-ci                           Skip CI infrastructure provisioning
   --skip-post-setup                   Skip post-setup chain (mcp init, sync-architectures, sync-schemas)
+  --ignore-staleness                  Suppress schema staleness warnings
 STATUS OPTIONS:
   --verify                            Check each active resource against AWS APIs for drift detection
@@ -1487,13 +1503,15 @@ EXAMPLES:
   ml-container-creator bootstrap list
   ml-container-creator bootstrap remove dev
   ml-container-creator bootstrap remove dev --force --delete-stack
+  ml-container-creator bootstrap update
+  ml-container-creator bootstrap update --ci --benchmark-infra
   ml-container-creator bootstrap scan
+  ml-container-creator bootstrap sync-schemas
   ml-container-creator bootstrap sync-model-families
+  ml-container-creator bootstrap sync-serving-versions
   ml-container-creator bootstrap migrate
   ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
-  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
   ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci
-  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --skip-ci
 `);
     }

package/src/lib/bootstrap-profile-manager.js CHANGED Viewed

@@ -172,6 +172,23 @@ export default class BootstrapProfileManager {
             }
         }
+        // Check AI Registry hub status
+        if (profile.config.aiRegistryHubName) {
+            try {
+                const hubExists = this.handler._resourceExists(
+                    `sagemaker describe-hub --hub-name ${profile.config.aiRegistryHubName} --region ${profile.config.awsRegion}`,
+                    profile.config.awsProfile
+                );
+                console.log(hubExists
+                    ? `  ✅ AI Registry hub: ${profile.config.aiRegistryHubName}`
+                    : `  ⚠️  AI Registry hub: ${profile.config.aiRegistryHubName} — missing`);
+            } catch {
+                console.log(`  ⚠️  AI Registry hub: ${profile.config.aiRegistryHubName} — could not validate`);
+            }
+        } else {
+            console.log('  ℹ️  AI Registry hub: not provisioned (run bootstrap to create)');
+        }
         // Display deployed resources from manifest
         console.log('\n📦 Deployed Resources:');
@@ -638,4 +655,20 @@ export default class BootstrapProfileManager {
             process.exit(1);
         }
     }
+    /**
+     * Handle sync-serving-versions subcommand: discover latest container image
+     * versions for vLLM, SGLang, and TensorRT-LLM and update the model-servers catalog.
+     */
+    async _handleSyncServingVersions() {
+        console.log('\n🔄 Sync Serving Versions — Discovering latest container images...\n');
+        try {
+            const { syncServingVersions } = await import('../../scripts/sync-serving-versions.js');
+            const result = await syncServingVersions();
+            console.log(`\n✅ Sync complete: ${result.totalAdded} new, ${result.totalRemoved} pruned\n`);
+        } catch (err) {
+            console.log(`❌ Sync failed: ${err.message}`);
+            process.exit(1);
+        }
+    }
 }

package/src/lib/bootstrap-provisioners.js CHANGED Viewed

@@ -405,6 +405,54 @@ export default class BootstrapProvisioners {
         }
     }
+    /**
+     * Provision a deterministic SageMaker AI Registry Hub.
+     * Idempotent: checks if `mlcc-registry-{accountId}` already exists before creating.
+     * Non-fatal: catches all errors and prints a warning — bootstrap continues regardless.
+     *
+     * @param {object} profileData - Profile data object (mutated in place with hub info)
+     */
+    async provisionAiRegistryHub(profileData) {
+        const hubName = `mlcc-registry-${profileData.accountId}`;
+        const region = profileData.awsRegion;
+        console.log('\n📦 Provisioning AI Registry hub...');
+        try {
+            // Check if hub already exists (idempotent)
+            const hubExists = this.handler._resourceExists(
+                `sagemaker describe-hub --hub-name ${hubName} --region ${region}`,
+                this.handler._currentProfile
+            );
+            if (hubExists) {
+                const hubInfo = this.handler._execAws(
+                    `sagemaker describe-hub --hub-name ${hubName} --region ${region}`,
+                    this.handler._currentProfile
+                );
+                console.log(`  ✅ AI Registry hub already provisioned: ${hubName}`);
+                profileData.aiRegistryHubName = hubName;
+                profileData.aiRegistryHubArn = hubInfo.HubArn;
+                return;
+            }
+            // Create new hub (always — no adopt-existing logic)
+            const tags = this._buildResourceTags();
+            const tagsFile = this.handler._formatTagsForCli(tags);
+            const createResult = this.handler._execAws(
+                `sagemaker create-hub --hub-name ${hubName} --hub-display-name "MCC AI Registry" --hub-description "Dataset, evaluator, and model versioning for ml-container-creator" --tags ${tagsFile} --region ${region}`,
+                this.handler._currentProfile
+            );
+            console.log(`  ✅ AI Registry hub "${hubName}" — created`);
+            profileData.aiRegistryHubName = hubName;
+            profileData.aiRegistryHubArn = createResult.HubArn;
+        } catch (err) {
+            const message = err.message || String(err);
+            console.log(`  ⚠️  Could not provision AI Registry hub (non-fatal): ${message}`);
+            console.log('     Dataset registration will use local JSON registry.');
+        }
+    }
     /**
      * Build the standard resource tag set.
      * @returns {Array<{Key: string, Value: string}>} Tag array

package/src/lib/cross-cutting-checker.js CHANGED Viewed

@@ -290,7 +290,12 @@ export default class CrossCuttingChecker {
         if (!modelType || !server || !serverVersion) return findings;
         const entries = modelServersCatalog[server] || [];
-        const entry = entries.find(e => e.labels?.framework_version === serverVersion);
+        // Try exact version match first, then fall back to nearest entry with supportedModelTypes
+        let entry = entries.find(e => e.labels?.framework_version === serverVersion);
+        if (!entry?.supportedModelTypes?.length) {
+            // Fall back to any entry that has supportedModelTypes populated
+            entry = entries.find(e => e.supportedModelTypes?.length > 0);
+        }
         if (!entry?.supportedModelTypes?.length) return findings;
         if (!entry.supportedModelTypes.includes(modelType.toLowerCase())) {

package/src/lib/generated/cli-options.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-23T20:55:23.381Z
+// Generated: 2026-06-29T13:37:06.271Z
 /**
  * CLI option definitions derived from parameter-schema-v2.json.

package/src/lib/generated/parameter-matrix.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-23T20:55:23.482Z
+// Generated: 2026-06-29T13:37:06.375Z
 /**
  * Parameter matrix defining how each parameter is loaded from various sources.

package/src/lib/generated/validation-rules.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-23T20:55:23.412Z
+// Generated: 2026-06-29T13:37:06.303Z
 /**
  * Validation rules derived from parameter-schema-v2.json.

package/src/lib/path-prover-brain.js CHANGED Viewed

@@ -16,6 +16,24 @@ const __dirname = dirname(__filename);
  * classifies failures, gates tune/adapter stages, and builds
  * Athena-compatible records with run_type='path_prove'.
  *
+ * ## Module Status (AC-1.4)
+ *
+ * ALL exported functions are FULLY FUNCTIONAL:
+ * - `identifyGaps()` — Cartesian product gap finder, prioritized by neighbor count
+ * - `findNearestSubstitution()` — Hamming distance nearest-neighbor, same-family constraint
+ * - `classifyFailure()` — regex pattern matching to 6 categories (capacity, timeout, oom, code_bug, model_incompatibility, service_limitation)
+ * - `shouldExecuteTuneStages()` — gating logic for tune/adapter stages
+ * - `hammingDistance()` — config vector comparison across CONFIG_DIMENSIONS
+ * - `buildPathProverRecord()` — Athena record construction with run_type='path_prove'
+ * - `findUnfeasibleRecord()` — checks if a config is known-unfeasible to prevent repeated attempts
+ * - `getNextPriorityConfig()` — priority queue management for v1 validation mode
+ * - `updatePriorityStatus()` — updates target status after prove attempts
+ * - `getPriorityQueueStatus()` — summary counts for priority queue
+ * - `loadPriorityTargets()` — file-based priority target loading
+ * - `resolveProveTpDegree()` — TP degree auto-resolution from instance catalog
+ *
+ * This is stabilization (tests + docs), not implementation. No new logic needed.
+ *
  * Feature: ci-benchmark-pipeline
  * Requirements: 8.1–8.12
  */
@@ -611,6 +629,45 @@ export function loadPriorityTargets(configPath) {
     }
 }
+// ── Optimization Space Schema (Task 3 — AC-3.5) ─────────────────────────────
+/**
+ * Load the optimization search space schema from config/optimization-space.json.
+ *
+ * Returns the parsed schema with dimensions, version, and description.
+ * Used by gap identification to enumerate sweepable dimensions and their
+ * allowed values for the optimization/prove sweep.
+ *
+ * @returns {object|null} Parsed schema object, or null if file not found/invalid
+ */
+export function loadOptimizationSpace() {
+    try {
+        const schemaPath = resolve(__dirname, '..', '..', 'config', 'optimization-space.json');
+        const raw = readFileSync(schemaPath, 'utf8');
+        return JSON.parse(raw);
+    } catch {
+        return null;
+    }
+}
+/**
+ * Get the list of sweepable dimension names from the optimization space schema.
+ *
+ * Filters dimensions by status === 'sweepable' and returns their keys.
+ * Useful for verifying sync between CONFIG_DIMENSIONS and the schema.
+ *
+ * @param {object} [schema] - Pre-loaded schema (loads from file if omitted)
+ * @returns {string[]} Array of sweepable dimension names
+ */
+export function getSweepableDimensions(schema = null) {
+    const data = schema || loadOptimizationSpace();
+    if (!data || !data.dimensions) return [];
+    return Object.keys(data.dimensions).filter(
+        key => data.dimensions[key].status === 'sweepable'
+    );
+}
 // ── TP Degree Auto-Resolution at Prove-Time (Task 6.5) ──────────────────────
 /**

package/src/lib/prove-pipeline-executor.js CHANGED Viewed

@@ -8,6 +8,25 @@
  * Handles stage-specific logic including idempotency checks, status tracking,
  * and fail-fast behavior.
  *
+ * ## Module Status (AC-1.4)
+ *
+ * FUNCTIONAL stages:
+ * - `executeStageStep()` — fully wired with idempotency via `.mlcc/staged-assets.json`
+ * - `isAlreadyStaged()` — checks staged assets existence and validity
+ * - `getStagingState()` — resolves current staging state from filesystem + step results
+ * - `isValidLifecycleStage()` — validates individual stage names
+ * - `validateStagesArray()` — validates arrays of stage names
+ * - `formatStagingStatus()` — formats staging state for display
+ * - `buildTargetStatus()` — builds status summary for a prove target
+ *
+ * INTENTIONALLY INCOMPLETE (post-v1 scope):
+ * - Other lifecycle stage executors (build, push, deploy, test, tune, adapter,
+ *   test-adapter, benchmark, register, clean) are NOT implemented.
+ * - Only the `stage` step has execution logic. Other stages are recognized in
+ *   validation but have no executor function.
+ * - This is not "broken" — these were never finished before the laptop was bricked.
+ *   They are explicitly post-v1 scope.
+ *
  * Feature: s3-model-loading
  * Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
  */
@@ -40,6 +59,22 @@ export const VALID_LIFECYCLE_STAGES = [
     'clean'
 ];
+// TODO(post-v1): Implement executor functions for lifecycle stages beyond 'stage'.
+// The following stages are recognized for validation purposes but have no execution logic:
+//   - generate: Should invoke `mcc generate` to produce project scaffolding
+//   - build: Should run `do/build` to build the Docker container
+//   - push: Should run `do/push` to push container to ECR
+//   - deploy: Should run `do/deploy` to create SageMaker endpoint
+//   - test: Should run `do/test` to invoke endpoint and verify correctness
+//   - tune: Should run `do/tune` for fine-tuning jobs (gated by shouldExecuteTuneStages)
+//   - adapter: Should run `do/adapter` for LoRA adapter serving
+//   - test-adapter: Should test adapter endpoints after deployment
+//   - benchmark: Should run `do/benchmark` for performance measurement
+//   - register: Should register proven config in Athena/DynamoDB
+//   - clean: Should tear down deployed resources
+// These were never finished before the original developer's laptop was bricked.
+// They are explicitly post-v1 scope, not "broken" code.
 /**
  * Possible staging states for status output.
  */

package/templates/do/.benchmark_writer.py CHANGED Viewed

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark Writer — Converts do/benchmark output to enriched Parquet for Athena.
@@ -340,7 +338,7 @@ def _extract_base_image_version(base_image):
     return ''
-def enrich_records(config, results, run_timestamp=None):
+def enrich_records(config, results, run_timestamp=None, instance_catalog=None):
     """Build enriched records from config context and benchmark results.
     Each metrics entry becomes one enriched record with all Athena columns populated.
@@ -349,6 +347,7 @@ def enrich_records(config, results, run_timestamp=None):
         config: dict with config context fields (project_name, model_name, etc.)
         results: dict with benchmark results (job_name, metrics array)
         run_timestamp: Optional datetime for run_timestamp. Defaults to now UTC.
+        instance_catalog: Optional pre-loaded instance catalog dict. If None, loaded from disk.
     Returns:
         list of enriched record dicts (one per concurrency level).
@@ -364,10 +363,21 @@ def enrich_records(config, results, run_timestamp=None):
     # Derived fields
     model_family = derive_model_family(model_name)
+    instance_family = derive_instance_family(instance_type)
+    # Resolve instance metadata from catalog (AC-2.8)
+    hw_meta = resolve_instance_metadata(instance_type, instance_catalog)
+    gpu_count = hw_meta['gpu_count']
+    gpu_type = hw_meta['gpu_type']
+    gpu_memory_gb = hw_meta['gpu_memory_gb']
     # Optional context fields
     deployment_target = config.get('deployment_target', 'realtime-inference')
-    tensor_parallel_degree = config.get('tensor_parallel_degree', 1)
+    try:
+        tensor_parallel_degree = int(config.get('tensor_parallel_degree', 1))
+    except (ValueError, TypeError):
+        tensor_parallel_degree = 1
     quantization = config.get('quantization', 'none')
     enable_lora = config.get('enable_lora', False)
     base_image = config.get('base_image', '')
@@ -377,6 +387,11 @@ def enrich_records(config, results, run_timestamp=None):
     ci_run_id = config.get('ci_run_id', '')
     account_id = config.get('account_id', '')
+    # Configuration dimensions (nullable)
+    max_model_len_raw = config.get('max_model_len')
+    max_model_len = int(max_model_len_raw) if max_model_len_raw not in (None, '', 0) else None
+    kv_cache_dtype = config.get('kv_cache_dtype') or None
     # Get metrics from results
     metrics = results.get('metrics', []) if isinstance(results, dict) else []
@@ -447,6 +462,13 @@ def enrich_records(config, results, run_timestamp=None):
             'deployment_target': deployment_target,
             'quantization': quantization,
             'tensor_parallel_degree': tensor_parallel_degree,
+            'instance_family': instance_family,
+            'gpu_count': gpu_count,
+            'gpu_type': gpu_type,
+            'gpu_memory_gb': gpu_memory_gb,
+            'max_model_len': max_model_len,
+            'enable_lora': enable_lora,
+            'kv_cache_dtype': kv_cache_dtype,
             'serving_config': json.dumps(serving_config_dict),
             'workload': config.get('workload', 'manual'),
             'concurrency': concurrency,
@@ -481,6 +503,7 @@ def enrich_records(config, results, run_timestamp=None):
             'output_sequence_length_avg': scalar(metric.get('output_sequence_length', metric.get('output_sequence_length_avg', 0.0))),
             'input_sequence_length_avg': scalar(metric.get('input_sequence_length', metric.get('input_sequence_length_avg', 0.0))),
             'error_rate': error_rate,
+            'cost_per_1m_tokens': cost,
             'benchmark_duration_sec': metric.get('benchmark_duration_sec', duration_seconds),
             'run_type': run_type,
             'benchmark_job_name': results.get('job_name', '') if isinstance(results, dict) else '',
@@ -792,6 +815,54 @@ def register_partition(bucket, model, instance, target,
 # ── Parquet Serialization ─────────────────────────────────────────────────────
+def load_instance_catalog():
+    """Load the instance catalog from servers/lib/catalogs/instances.json.
+    Resolves the path relative to the project root (two levels up from templates/do/).
+    Returns the 'catalog' dict mapping instance_type → metadata, or empty dict on failure.
+    Returns:
+        dict mapping instance type strings to their metadata dicts.
+    """
+    # Resolve relative to this file: templates/do/.benchmark_writer.py → project root
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    # Navigate up from templates/do/ to project root
+    project_root = os.path.normpath(os.path.join(this_dir, '..', '..'))
+    catalog_path = os.path.join(project_root, 'servers', 'lib', 'catalogs', 'instances.json')
+    try:
+        with open(catalog_path, 'r') as f:
+            data = json.load(f)
+        return data.get('catalog', {})
+    except (FileNotFoundError, json.JSONDecodeError, IOError):
+        return {}
+def resolve_instance_metadata(instance_type, instance_catalog=None):
+    """Resolve GPU metadata from the instance catalog for a given instance_type.
+    Args:
+        instance_type: SageMaker instance type (e.g., 'ml.g5.xlarge').
+        instance_catalog: Optional pre-loaded catalog dict. If None, loads from disk.
+    Returns:
+        dict with keys: gpu_count (int|None), gpu_type (str|None), gpu_memory_gb (float|None).
+        All values are None if instance_type is not found in catalog.
+    """
+    if instance_catalog is None:
+        instance_catalog = load_instance_catalog()
+    entry = instance_catalog.get(instance_type)
+    if entry is None:
+        return {'gpu_count': None, 'gpu_type': None, 'gpu_memory_gb': None}
+    return {
+        'gpu_count': entry.get('gpus'),
+        'gpu_type': entry.get('gpuType'),
+        'gpu_memory_gb': entry.get('gpuMemoryGb'),
+    }
 def get_parquet_schema():
     """Return the pyarrow schema matching the Athena DDL for benchmark_results.
@@ -814,6 +885,17 @@ def get_parquet_schema():
         pa.field("quantization", pa.string()),
         pa.field("tensor_parallel_degree", pa.int32()),
+        # Hardware metadata (resolved from instance catalog at write time)
+        pa.field("instance_family", pa.string()),
+        pa.field("gpu_count", pa.int32()),
+        pa.field("gpu_type", pa.string()),
+        pa.field("gpu_memory_gb", pa.float64()),
+        # Configuration dimensions (top-level for Athena queryability)
+        pa.field("max_model_len", pa.int32()),
+        pa.field("enable_lora", pa.bool_()),
+        pa.field("kv_cache_dtype", pa.string()),
         # Full serving config (extensible JSON blob)
         pa.field("serving_config", pa.string()),
@@ -852,6 +934,7 @@ def get_parquet_schema():
         pa.field("output_sequence_length_avg", pa.float64()),
         pa.field("input_sequence_length_avg", pa.float64()),
         pa.field("error_rate", pa.float64()),
+        pa.field("cost_per_1m_tokens", pa.float64()),
         pa.field("benchmark_duration_sec", pa.float64()),
         # Run Metadata
@@ -1182,6 +1265,9 @@ def cmd_write(args):
     if args.adapter_name:
         input_data['adapter_name'] = args.adapter_name
+    if getattr(args, 'instance_type', None):
+        input_data['instance_type'] = args.instance_type
     # ── Validate before any S3 interaction ────────────────────────────────
     errors = validate_benchmark_input(input_data)
     if errors:
@@ -1391,6 +1477,8 @@ def _load_config_file(config_path):
                     'MODEL_NAME': 'model_name',
                     'HF_MODEL_ID': 'hf_model_id',
                     'INSTANCE_TYPE': 'instance_type',
+                    'INSTANCE_POOLS': 'instance_pools',
+                    'BENCHMARK_INSTANCE_TYPE': 'benchmark_instance_type',
                     'DEPLOYMENT_CONFIG': 'deployment_config',
                     'DEPLOYMENT_TARGET': 'deployment_target',
                     'AWS_REGION': 'region',
@@ -1429,6 +1517,24 @@ def _load_config_file(config_path):
         parts = context['model_name'].rstrip('/').split('/')
         context['model_name'] = parts[-1] if parts else context['model_name']
+    # Resolve instance_type precedence:
+    #   BENCHMARK_INSTANCE_TYPE (live-resolved, persisted by do/benchmark) > INSTANCE_TYPE > INSTANCE_POOLS fallback
+    if context.get('benchmark_instance_type'):
+        context['instance_type'] = context.pop('benchmark_instance_type')
+    # Fall back to INSTANCE_POOLS when neither is set.
+    # Heterogeneous pool configs may not have a standalone INSTANCE_TYPE value
+    # but always define INSTANCE_POOLS as a JSON array with Priority fields.
+    if not context.get('instance_type') and context.get('instance_pools'):
+        try:
+            pools = json.loads(context['instance_pools'])
+            if pools:
+                # Pick the highest-priority (lowest number) instance
+                best = min(pools, key=lambda p: p.get('Priority', 999))
+                context['instance_type'] = best.get('InstanceType', '')
+        except (json.JSONDecodeError, TypeError, KeyError):
+            pass
+    context.pop('instance_pools', None)  # Don't leak raw JSON into record
     # Also scan IC config files (do/ic/*.conf) for IC_ENV_* serving params
     # These override do/config values for serving-specific settings
     try:
@@ -1505,6 +1611,10 @@ def main():
         help='LoRA adapter name (differentiates adapter benchmarks from base model in Athena)'
     )
+    write_parser.add_argument(
+        '--instance-type', dest='instance_type', default=None,
+        help='Override instance type (use when actual provisioned instance differs from config, e.g. heterogeneous pools)'
+    )
     write_parser.add_argument(
         '--dry-run', dest='dry_run', action='store_true',
         help='Output enriched records as JSON without writing to S3'