npm - @aws/ml-container-creator - Versions diffs - 0.9.1 → 0.10.3 - Mend

@aws/ml-container-creator 0.9.1 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +2049 -0
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +53 -68
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +138 -138
package/servers/instance-sizer/lib/instance-ranker.js +76 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/jumpstart-public.json +101 -16
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/catalogs/models.json +182 -26
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +4 -390
package/src/lib/bootstrap-command-handler.js +710 -1148
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +641 -0
package/src/lib/bootstrap-provisioners.js +421 -0
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +408 -0
package/src/lib/config-manager.js +66 -1685
package/src/lib/config-mcp-client.js +118 -0
package/src/lib/config-validator.js +634 -0
package/src/lib/cuda-resolver.js +149 -0
package/src/lib/e2e-catalog-validator.js +251 -3
package/src/lib/e2e-ci-recorder.js +103 -0
package/src/lib/generated/cli-options.js +315 -311
package/src/lib/generated/parameter-matrix.js +671 -0
package/src/lib/generated/validation-rules.js +71 -71
package/src/lib/marketplace-flow.js +276 -0
package/src/lib/mcp-query-runner.js +768 -0
package/src/lib/parameter-schema-validator.js +62 -18
package/src/lib/path-prover-brain.js +607 -0
package/src/lib/prompt-runner.js +41 -1504
package/src/lib/prompts/feature-prompts.js +172 -0
package/src/lib/prompts/index.js +48 -0
package/src/lib/prompts/infrastructure-prompts.js +690 -0
package/src/lib/prompts/model-prompts.js +552 -0
package/src/lib/prompts/project-prompts.js +82 -0
package/src/lib/prompts.js +2 -1446
package/src/lib/registry-command-handler.js +135 -3
package/src/lib/secrets-prompt-runner.js +251 -0
package/src/lib/template-variable-resolver.js +422 -0
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +149 -0
package/templates/do/benchmark +639 -85
package/templates/do/config +108 -5
package/templates/do/deploy.d/managed-inference.ejs +192 -11
package/templates/do/optimize +106 -37
package/templates/do/register +89 -0
package/templates/do/test +13 -0
package/templates/do/tune +378 -59
package/templates/do/validate +44 -4
package/config/parameter-schema.json +0 -88

package/infra/ci-harness/buildspec.yml CHANGED Viewed

@@ -7,6 +7,7 @@ env:
     CONFIG_ID: ""
     CONFIG_JSON: ""
     BUILD_STRATEGY: "codebuild-submit"
+    BENCHMARK_CONCURRENCY_LEVELS: ""
 phases:
   install:

package/infra/ci-harness/lambda/path-prover/brain.ts ADDED Viewed

@@ -0,0 +1,306 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Path Prover Brain Lambda
+ *
+ * Handles three actions for the Path Prover state machine:
+ * - getNextConfig: Initial brain call to get the first config to prove
+ * - pickNext: After a prove iteration, decides next or done
+ * - classifyFailure: On error, classify and return structured result
+ *
+ * Budget controls: MAX_PROVES_PER_RUN (default 10), MAX_COST_PER_RUN (default 100 USD)
+ *
+ * Requirements: 8.1, 8.7, 8.8
+ */
+/**
+ * Names of the configuration dimensions used for gap identification.
+ *
+ * NOTE(review): no other code visible in this file references this
+ * constant — confirm whether it is consumed elsewhere or can be removed.
+ */
+const CONFIG_DIMENSIONS = [
+    'deployment_config',
+    'model_family',
+    'instance_family',
+    'quantization',
+    'tp_degree',
+    'deployment_target'
+]
+/**
+ * Error pattern matchers for failure classification.
+ *
+ * Each entry pairs a case-insensitive regular expression with the failure
+ * category and retryable flag reported when it matches. handleClassifyFailure
+ * walks the list in order and returns on the first match, so earlier entries
+ * take precedence over later ones.
+ *
+ * NOTE(review): several patterns are unanchored substrings. /OOM/i also
+ * matches inside ordinary words such as "bloom", "room" or "zoom", and
+ * /timeout/i matches option names like "connect_timeout=30" — confirm these
+ * false positives are acceptable or add word boundaries.
+ */
+const ERROR_PATTERNS: Array<{ pattern: RegExp; category: string; retryable: boolean }> = [
+    { pattern: /InsufficientInstanceCapacity/i, category: 'capacity', retryable: true },
+    { pattern: /CapacityError/i, category: 'capacity', retryable: true },
+    { pattern: /no capacity/i, category: 'capacity', retryable: true },
+    { pattern: /timed?\s*out/i, category: 'timeout', retryable: true },
+    { pattern: /timeout/i, category: 'timeout', retryable: true },
+    { pattern: /deadline exceeded/i, category: 'timeout', retryable: true },
+    { pattern: /OutOfMemory/i, category: 'oom', retryable: false },
+    { pattern: /OOM/i, category: 'oom', retryable: false },
+    { pattern: /CUDA out of memory/i, category: 'oom', retryable: false },
+    { pattern: /Cannot allocate memory/i, category: 'oom', retryable: false },
+    { pattern: /template.*error/i, category: 'code_bug', retryable: false },
+    { pattern: /SyntaxError/i, category: 'code_bug', retryable: false },
+    { pattern: /ReferenceError/i, category: 'code_bug', retryable: false },
+    { pattern: /not supported.*model/i, category: 'model_incompatibility', retryable: false },
+    { pattern: /model.*incompatible/i, category: 'model_incompatibility', retryable: false },
+    { pattern: /LoRA.*not supported/i, category: 'model_incompatibility', retryable: false },
+    { pattern: /not available.*region/i, category: 'service_limitation', retryable: false },
+    { pattern: /service.*not supported/i, category: 'service_limitation', retryable: false },
+    { pattern: /ValidationException/i, category: 'service_limitation', retryable: false }
+]
+/**
+ * Approximate cost per hour for common instance families (USD).
+ * Used for budget estimation.
+ *
+ * Keys are either a bare instance family ('g5', 'p4d', ...) or a full
+ * instance type ('ml.g5.xlarge', ...). estimateCost looks up the instance
+ * type first and falls back to the family entry.
+ */
+const INSTANCE_COST_PER_HOUR: Record<string, number> = {
+    'g5': 1.21,
+    'g6': 0.98,
+    'g6e': 1.32,
+    'p4d': 32.77,
+    'p5': 65.00,
+    'trn2': 21.50,
+    'inf2': 1.58,
+    'ml.g5.xlarge': 1.21,
+    'ml.g5.2xlarge': 1.52,
+    'ml.g5.12xlarge': 7.09,
+    'ml.g5.48xlarge': 20.09
+}
+/**
+ * Estimated hours per prove run (generate+build+deploy+test+benchmark+clean).
+ * estimateCost multiplies the hourly instance rate by this value to price
+ * a single prove.
+ */
+const ESTIMATED_HOURS_PER_PROVE = 1.5
+interface BrainEvent { // Input payload for the brain Lambda handler; only `action` is required.
+    action: string // dispatch key: 'getNextConfig', 'pickNext' or 'classifyFailure'
+    iteration?: number // proves counted so far; defaults to 0 (getNextConfig) or 1 (pickNext)
+    budgetSpent?: number // accumulated estimated cost; defaults to 0
+    maxProvesPerRun?: number // iteration cap; defaults to 10
+    maxCostPerRun?: number // cost cap; defaults to 100
+    previousResults?: Array<Record<string, unknown>> // list the next config is read from, indexed by iteration
+    currentConfig?: Record<string, unknown> // not read by any handler in this file
+    lastResult?: string // not read by any handler in this file
+    classification?: Record<string, unknown> // not read by any handler in this file
+    error?: Record<string, unknown> // read by classifyFailure (Cause / Error fields)
+    config?: Record<string, unknown> // not read by any handler in this file
+}
+interface BrainResponse { // Result of the getNextConfig and pickNext actions.
+    done?: boolean // true when the run should stop
+    reason?: string // set with done: true — 'max_proves_reached', 'budget_exceeded' or 'all_gaps_filled'
+    next?: Record<string, unknown> // config selected for the next prove
+    tuneRequested?: boolean // result of shouldExecuteTuneStages(next)
+    iteration?: number // updated iteration counter
+    budgetSpent?: number // budget including the estimated cost of `next`
+    previousResults?: Array<Record<string, unknown>> // echoed back by pickNext only
+}
+interface ClassificationResult { // Result of the classifyFailure action.
+    stage: string // value returned by detectStage, or 'unknown'
+    category: string // category of the first matching ERROR_PATTERNS entry, else 'code_bug'
+    retryable: boolean // retryable flag of the matching entry, else false
+}
+export async function handler(event: BrainEvent): Promise<BrainResponse | ClassificationResult> { // Lambda entry point: dispatches on event.action.
+    const action = event.action
+    switch (action) {
+        case 'getNextConfig':
+            return handleGetNextConfig(event)
+        case 'pickNext':
+            return handlePickNext(event)
+        case 'classifyFailure':
+            return handleClassifyFailure(event)
+        default:
+            throw new Error(`Unknown action: ${action}`) // any other action rejects the returned promise
+    }
+}
+/**
+ * getNextConfig: Called at the start of the state machine.
+ * Returns the first config to prove or {done: true} if nothing to do.
+ *
+ * @param event - Brain event; reads iteration, budgetSpent, maxProvesPerRun,
+ *   maxCostPerRun and previousResults, applying defaults 0, 0, 10, 100 and [].
+ * @returns {done: true, reason} when the prove cap or budget is already
+ *   reached or no config is available; otherwise {done: false} together with
+ *   the selected config, tuneRequested, iteration + 1 and budgetSpent plus
+ *   the estimated cost of the selected config.
+ *
+ * NOTE(review): unlike handlePickNext, this response does not include
+ * previousResults, and the budget check here does not add the estimated
+ * cost of the selected config before comparing — confirm both are intended.
+ */
+function handleGetNextConfig(event: BrainEvent): BrainResponse {
+    const iteration = event.iteration ?? 0
+    const budgetSpent = event.budgetSpent ?? 0
+    const maxProvesPerRun = event.maxProvesPerRun ?? 10
+    const maxCostPerRun = event.maxCostPerRun ?? 100
+    // Stop early if the prove cap or the cost budget is already used up
+    if (iteration >= maxProvesPerRun) {
+        return { done: true, reason: 'max_proves_reached' }
+    }
+    if (budgetSpent >= maxCostPerRun) {
+        return { done: true, reason: 'budget_exceeded' }
+    }
+    // In a real implementation, this would query Athena for gaps.
+    // The prove request configs come from the execution input's
+    // previousResults/gap list. For the state machine orchestration,
+    // the initial config is passed in the execution input.
+    const previousResults = event.previousResults ?? []
+    // If there's no work to do (no gaps identified), we're done
+    if (previousResults.length === 0 && iteration === 0) {
+        return { done: true, reason: 'all_gaps_filled' }
+    }
+    // Get next unproven config from the list (the entry at index `iteration`)
+    const nextConfig = getNextUnprovenConfig(previousResults, iteration)
+    if (!nextConfig) {
+        return { done: true, reason: 'all_gaps_filled' }
+    }
+    // Determine if tune stages are needed
+    const tuneRequested = shouldExecuteTuneStages(nextConfig)
+    return {
+        done: false,
+        next: nextConfig,
+        tuneRequested,
+        iteration: iteration + 1,
+        budgetSpent: budgetSpent + estimateCost(nextConfig)
+    }
+}
+/**
+ * pickNext: Called after a prove iteration (success or failure).
+ * Decides whether to continue or stop.
+ *
+ * @param event - Brain event; reads iteration (default 1), budgetSpent
+ *   (default 0), maxProvesPerRun (default 10), maxCostPerRun (default 100)
+ *   and previousResults (default []).
+ * @returns {done: true, reason} when iteration + 1 exceeds maxProvesPerRun,
+ *   when previousResults has no entry at index `iteration`, or when adding
+ *   the next config's estimated cost would exceed maxCostPerRun; otherwise
+ *   {done: false} with the next config, tuneRequested, the incremented
+ *   iteration, the updated budgetSpent and previousResults passed through.
+ */
+function handlePickNext(event: BrainEvent): BrainResponse {
+    const iteration = event.iteration ?? 1
+    const budgetSpent = event.budgetSpent ?? 0
+    const maxProvesPerRun = event.maxProvesPerRun ?? 10
+    const maxCostPerRun = event.maxCostPerRun ?? 100
+    const previousResults = event.previousResults ?? []
+    // Update iteration count
+    const newIteration = iteration + 1
+    // Check budget controls
+    if (newIteration > maxProvesPerRun) {
+        return { done: true, reason: 'max_proves_reached' }
+    }
+    // Select the next config (index newIteration - 1), then price it and check budget
+    const nextConfig = getNextUnprovenConfig(previousResults, newIteration - 1)
+    if (!nextConfig) {
+        return { done: true, reason: 'all_gaps_filled' }
+    }
+    const estimatedNextCost = estimateCost(nextConfig)
+    if (budgetSpent + estimatedNextCost > maxCostPerRun) {
+        return { done: true, reason: 'budget_exceeded' }
+    }
+    // Determine if tune stages are needed
+    const tuneRequested = shouldExecuteTuneStages(nextConfig)
+    return {
+        done: false,
+        next: nextConfig,
+        tuneRequested,
+        iteration: newIteration,
+        budgetSpent: budgetSpent + estimatedNextCost,
+        previousResults
+    }
+}
+/**
+ * classifyFailure: Parse error output and classify into a category.
+ *
+ * @param event - Brain event; only event.error is read.
+ * @returns {stage: 'unknown', category: 'code_bug', retryable: false} when
+ *   event.error is missing. Otherwise the message is taken from the error
+ *   itself if it is a string, else from its Cause field, else its Error
+ *   field, else its JSON serialization; the stage comes from detectStage
+ *   and category/retryable from the first matching ERROR_PATTERNS entry,
+ *   falling back to 'code_bug' / not retryable.
+ *
+ * NOTE(review): BrainEvent types error as an object, yet a string is handled
+ * at runtime — presumably some callers pass plain strings; confirm.
+ */
+function handleClassifyFailure(event: BrainEvent): ClassificationResult {
+    const error = event.error
+    if (!error) {
+        return { stage: 'unknown', category: 'code_bug', retryable: false }
+    }
+    // Extract error message
+    let errorMsg = ''
+    if (typeof error === 'string') {
+        errorMsg = error
+    } else {
+        errorMsg = (error as Record<string, string>).Cause
+            || (error as Record<string, string>).Error
+            || JSON.stringify(error)
+    }
+    // Detect stage
+    const stage = detectStage(errorMsg)
+    // Match against patterns; the first matching entry decides the result
+    for (const { pattern, category, retryable } of ERROR_PATTERNS) {
+        if (pattern.test(errorMsg)) {
+            return { stage, category, retryable }
+        }
+    }
+    return { stage, category: 'code_bug', retryable: false }
+}
+/**
+ * Detect which lifecycle stage produced an error from the error message.
+ *
+ * Patterns are tested in the order listed and the first match wins, so a
+ * message mentioning several stages is attributed to whichever matching
+ * pattern is listed first. Returns 'unknown' when no pattern matches.
+ *
+ * NOTE(review): every pattern ends with \b, so only whole words match. In
+ * the tune pattern the "fine-?tun" alternative therefore cannot match
+ * "fine-tune", "finetuning", etc. (a word character follows "tun"), and
+ * plurals such as "benchmarks" or "tests" do not match either — confirm
+ * whether stems were meant to match longer words.
+ */
+function detectStage(errorMsg: string): string {
+    const stagePatterns: Array<{ pattern: RegExp; stage: string }> = [
+        { pattern: /\b(generate|generation)\b/i, stage: 'generate' },
+        { pattern: /\b(build|docker)\b/i, stage: 'build' },
+        { pattern: /\b(push|ecr|registry)\b/i, stage: 'push' },
+        { pattern: /\b(deploy|endpoint|CreateEndpoint)\b/i, stage: 'deploy' },
+        { pattern: /\b(test|invoke|invocation)\b/i, stage: 'test' },
+        { pattern: /\b(tune|fine-?tun|customization)\b/i, stage: 'tune' },
+        { pattern: /\b(adapter|lora)\b/i, stage: 'adapter' },
+        { pattern: /\b(benchmark|bench)\b/i, stage: 'benchmark' },
+        { pattern: /\b(clean|delete)\b/i, stage: 'clean' }
+    ]
+    for (const { pattern, stage } of stagePatterns) {
+        if (pattern.test(errorMsg)) {
+            return stage
+        }
+    }
+    return 'unknown'
+}
+/**
+ * Get the next unproven config from the list.
+ *
+ * @param configs - Candidate configs (callers in this file pass
+ *   event.previousResults).
+ * @param index - Position to read.
+ * @returns configs[index], or null when configs is falsy, index is at or
+ *   past the end of the list, or the slot holds null/undefined.
+ */
+function getNextUnprovenConfig(
+    configs: Array<Record<string, unknown>>,
+    index: number
+): Record<string, unknown> | null {
+    if (!configs || index >= configs.length) {
+        return null
+    }
+    return configs[index] ?? null
+}
+/**
+ * Determine whether tune/adapter stages should execute.
+ *
+ * @param config - Prove config to inspect.
+ * @returns true when include_tuning or enable_lora is strictly true, or when
+ *   tune_technique is truthy and not 'none'; false otherwise (including a
+ *   falsy config).
+ */
+function shouldExecuteTuneStages(config: Record<string, unknown>): boolean {
+    if (!config) return false
+    if (config.include_tuning === true) return true
+    if (config.enable_lora === true) return true
+    if (config.tune_technique && config.tune_technique !== 'none') return true
+    return false
+}
+/**
+ * Estimate cost of a prove run based on instance family.
+ *
+ * @param config - Prove config; reads instance_type (default '') and
+ *   instance_family (default 'g5').
+ * @returns Hourly rate from INSTANCE_COST_PER_HOUR — looked up by instance
+ *   type, then by family, then 2.0 — multiplied by ESTIMATED_HOURS_PER_PROVE.
+ *
+ * NOTE(review): when instance_family is absent it defaults to 'g5', so an
+ * instance type missing from the table is priced at the g5 rate and the
+ * 2.0 fallback is only reached for an unknown, explicitly set family.
+ */
+function estimateCost(config: Record<string, unknown>): number {
+    const instanceFamily = String(config.instance_family ?? 'g5')
+    const instanceType = String(config.instance_type ?? '')
+    // Try specific instance type first, then family
+    const costPerHour = INSTANCE_COST_PER_HOUR[instanceType]
+        ?? INSTANCE_COST_PER_HOUR[instanceFamily]
+        ?? 2.0 // Default fallback
+    return costPerHour * ESTIMATED_HOURS_PER_PROVE
+}

package/infra/ci-harness/lambda/path-prover/write-results.ts ADDED Viewed

@@ -0,0 +1,152 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Path Prover Write Results Lambda
+ *
+ * Writes benchmark results to Athena (via S3 Parquet) with run_type='path_prove'.
+ * Handles both success records and failure/unfeasible records.
+ *
+ * Requirements: 8.9, 8.10, 8.11, 8.12
+ */
+interface WriteResultsEvent { // Input payload for the write-results Lambda handler.
+    action: string // dispatch key: 'writeResults' or 'writeFailure'
+    config?: Record<string, unknown> // prove config; defaults to {} in both handlers
+    benchmarkResult?: Record<string, unknown> // not read by any code in this file
+    error?: Record<string, unknown> // failure details; read by writeFailure
+    classification?: { // failure classification; read by writeFailure
+        stage: string
+        category: string
+        retryable: boolean // false maps to status 'unfeasible'
+    }
+    runType: string // must resolve to 'path_prove' or the handlers throw
+}
+interface WriteResultsResponse { // Result returned by both write handlers.
+    success: boolean // always true on the return paths in this file
+    recordId?: string // generated 'pp-...' or 'pp-fail-...' identifier
+    status?: string // 'completed', 'failed' or 'unfeasible'
+    error?: string // never set by the code in this file
+}
+export async function handler(event: WriteResultsEvent): Promise<WriteResultsResponse> { // Lambda entry point: dispatches on event.action.
+    const action = event.action
+    switch (action) {
+        case 'writeResults':
+            return handleWriteResults(event)
+        case 'writeFailure':
+            return handleWriteFailure(event)
+        default:
+            throw new Error(`Unknown action: ${action}`) // any other action rejects the returned promise
+    }
+}
+/**
+ * Write a successful benchmark result to Athena.
+ * Sets status='completed', run_type='path_prove'.
+ *
+ * As written, this function only builds the record in memory and validates
+ * its run_type; no S3, Glue or Athena call is made here (see the inline
+ * "In production" note).
+ *
+ * @param event - Reads config (default {}) and runType (default 'path_prove').
+ * @returns {success: true, recordId, status: 'completed'}.
+ * @throws Error when the resolved run_type is not 'path_prove'.
+ *
+ * NOTE(review): event.benchmarkResult is not included in the record, and the
+ * record itself is discarded after validation — confirm this is a stub.
+ */
+function handleWriteResults(event: WriteResultsEvent): WriteResultsResponse {
+    const config = event.config ?? {}
+    const runType = event.runType ?? 'path_prove'
+    const recordId = `pp-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+    // In production, this would:
+    // 1. Build a Parquet record from config + benchmarkResult
+    // 2. Write to S3 at the partitioned path
+    // 3. Register the partition in Glue
+    // For the orchestration, we confirm the record was built correctly.
+    const record = {
+        config_id: config.config_id ?? config.configId ?? recordId,
+        run_type: runType,
+        status: 'completed',
+        run_timestamp: new Date().toISOString(),
+        ...extractConfigDimensions(config)
+    }
+    // Validate the record has run_type='path_prove'
+    if (record.run_type !== 'path_prove') {
+        throw new Error(`Invalid run_type: expected 'path_prove', got '${record.run_type}'`)
+    }
+    return {
+        success: true,
+        recordId,
+        status: 'completed'
+    }
+}
+/**
+ * Write a failure record to Athena.
+ * Non-retryable failures get status='unfeasible'; retryable get status='failed'.
+ *
+ * As written, the record is only built in memory and its run_type validated;
+ * nothing is persisted by this function.
+ *
+ * @param event - Reads config (default {}), classification, error and
+ *   runType (default 'path_prove').
+ * @returns {success: true, recordId, status}.
+ * @throws Error when the resolved run_type is not 'path_prove'.
+ *
+ * NOTE(review): with no classification the status stays 'failed' while the
+ * record's failure_retryable defaults to false — confirm this combination
+ * is intended.
+ */
+function handleWriteFailure(event: WriteResultsEvent): WriteResultsResponse {
+    const config = event.config ?? {}
+    const classification = event.classification
+    const error = event.error
+    const runType = event.runType ?? 'path_prove'
+    const recordId = `pp-fail-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+    // Determine status based on classification (only an explicit retryable === false yields 'unfeasible')
+    let status = 'failed'
+    if (classification && classification.retryable === false) {
+        status = 'unfeasible'
+    }
+    // Build failure reason: the string itself, else Cause, else Error, else the JSON form
+    let failureReason = 'Unknown failure'
+    if (error) {
+        failureReason = typeof error === 'string'
+            ? error
+            : (error as Record<string, string>).Cause
+                || (error as Record<string, string>).Error
+                || JSON.stringify(error)
+    }
+    const record = {
+        config_id: config.config_id ?? config.configId ?? recordId,
+        run_type: runType,
+        status,
+        failure_reason: failureReason,
+        failure_stage: classification?.stage ?? 'unknown',
+        failure_category: classification?.category ?? 'code_bug',
+        failure_retryable: classification?.retryable ?? false,
+        run_timestamp: new Date().toISOString(),
+        ...extractConfigDimensions(config)
+    }
+    // Validate the record has run_type='path_prove'
+    if (record.run_type !== 'path_prove') {
+        throw new Error(`Invalid run_type: expected 'path_prove', got '${record.run_type}'`)
+    }
+    return {
+        success: true,
+        recordId,
+        status
+    }
+}
+/**
+ * Extract config dimensions from a config object.
+ *
+ * @param config - Source config.
+ * @returns A new object holding only the keys listed in DIMS whose value in
+ *   config is not undefined (null values are kept).
+ */
+function extractConfigDimensions(config: Record<string, unknown>): Record<string, unknown> {
+    const dimensions: Record<string, unknown> = {}
+    const DIMS = [
+        'deployment_config', 'model_family', 'instance_family',
+        'quantization', 'tp_degree', 'deployment_target',
+        'model_name', 'instance_type'
+    ]
+    for (const dim of DIMS) {
+        if (config[dim] !== undefined) {
+            dimensions[dim] = config[dim]
+        }
+    }
+    return dimensions
+}