npm - tuningengines-cli - Versions diffs - 0.3.5 → 0.4.0 - Mend

tuningengines-cli 0.3.5 → 0.4.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (22) hide show

package/LICENSE +21 -0
package/README.md +103 -1
package/dist/cli.js +3 -1
package/dist/cli.js.map +1 -1
package/dist/client.d.ts +78 -1
package/dist/client.d.ts.map +1 -1
package/dist/client.js +111 -2
package/dist/client.js.map +1 -1
package/dist/commands/catalog.d.ts +4 -0
package/dist/commands/catalog.d.ts.map +1 -0
package/dist/commands/catalog.js +164 -0
package/dist/commands/catalog.js.map +1 -0
package/dist/commands/jobs.d.ts.map +1 -1
package/dist/commands/jobs.js +34 -1
package/dist/commands/jobs.js.map +1 -1
package/dist/commands/models.d.ts.map +1 -1
package/dist/commands/models.js +2 -1
package/dist/commands/models.js.map +1 -1
package/dist/mcp.d.ts.map +1 -1
package/dist/mcp.js +472 -32
package/dist/mcp.js.map +1 -1
package/package.json +2 -2

package/dist/mcp.js CHANGED Viewed

@@ -7,13 +7,22 @@ const types_js_1 = require("@modelcontextprotocol/sdk/types.js");
 const client_1 = require("./client");
 const config_1 = require("./config");
 async function startMcpServer() {
-    const client = new client_1.TuningEnginesClient({
-        apiKey: (0, config_1.getApiKey)(),
-        apiUrl: (0, config_1.getApiUrl)(),
-    });
-    const server = new index_js_1.Server({ name: "tuning-engines", version: "0.3.5" }, {
+    // Lazy client initialization — deferred until a tool is called.
+    // This allows the server to start and list tools without a valid API key,
+    // which is required for Glama inspection and tool detection.
+    let _client = null;
+    const getClient = () => {
+        if (!_client) {
+            _client = new client_1.TuningEnginesClient({
+                apiKey: (0, config_1.getApiKey)(),
+                apiUrl: (0, config_1.getApiUrl)(),
+            });
+        }
+        return _client;
+    };
+    const server = new index_js_1.Server({ name: "tuning-engines", version: "0.3.6" }, {
         capabilities: { tools: {} },
-        instructions: "Tuning Engines — Domain-specific fine-tuning of open-source LLMs and SLMs. Own your sovereign model with zero infrastructure.\n\n" +
+        instructions: "Tuning Engines — Domain-specific fine-tuning of open-source LLMs and SLMs, plus a Marketplace of pre-built models and datasets. Own your sovereign model with zero infrastructure.\n\n" +
             "USE THIS SERVER WHEN the user wants to:\n" +
             "- Fine-tune, train, or customize any open-source LLM/SLM on their data\n" +
             "- Build a sovereign AI model trained on their organization's code, documents, or domain data\n" +
@@ -21,11 +30,17 @@ async function startMcpServer() {
             "- Train using LoRA, QLoRA, or full fine-tuning methods\n" +
             "- Estimate the cost of fine-tuning a model\n" +
             "- Check training job status, manage trained models, or check billing\n" +
-            "- Export or import models to/from S3\n\n" +
+            "- Export or import models to/from S3\n" +
+            "- Browse and export pre-built models and datasets from the Marketplace\n\n" +
+            "MARKETPLACE:\n" +
+            "Pre-built, ready-to-use fine-tuned models and datasets curated by the platform. " +
+            "Browse the catalog, view details, and export directly to your S3 bucket. " +
+            "Credits are charged per export based on the item's price.\n" +
+            "Workflow: list_catalog_models → get_catalog_model → export_catalog_model → catalog_export_status\n\n" +
             "SPECIALIZED TUNING AGENTS (more coming):\n" +
             "- Cody (code_repo): Code autocomplete and inline suggestions via QLoRA/Axolotl\n" +
             "- SIERA (sera_code_repo): Bug-fix and error resolution via AllenAI Open Coding Agents\n\n" +
-            "TYPICAL WORKFLOW: estimate_job → create_job → job_status (poll until done) → list_models\n\n" +
+            "TYPICAL TRAINING WORKFLOW: estimate_job → create_job → job_status (poll until done) → list_models\n\n" +
             "Supports 1B to 72B parameter models from Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, StarCoder, and CodeLlama families.\n" +
             "Zero infrastructure — GPU provisioning, training orchestration, and model delivery fully managed.",
     });
@@ -75,9 +90,10 @@ async function startMcpServer() {
                     "Supports quality_tier='low' (faster) or quality_tier='high' (deeper analysis, more training data).\n\n" +
                     "SUPPORTED BASE MODELS (by size):\n" +
                     "- 3B: Qwen/Qwen2.5-Coder-3B-Instruct\n" +
-                    "- 7B: codellama/CodeLlama-7b-hf, deepseek-ai/deepseek-coder-7b-instruct-v1.5, Qwen/Qwen2.5-Coder-7B-Instruct\n" +
-                    "- 13-15B: codellama/CodeLlama-13b-Instruct-hf, bigcode/starcoder2-15b, Qwen/Qwen2.5-Coder-14B-Instruct\n" +
-                    "- 32-34B: deepseek-ai/deepseek-coder-33b-instruct, codellama/CodeLlama-34b-Instruct-hf, Qwen/Qwen2.5-Coder-32B-Instruct\n" +
+                    "- 7-8B: codellama/CodeLlama-7b-hf, deepseek-ai/deepseek-coder-7b-instruct-v1.5, Qwen/Qwen2.5-Coder-7B-Instruct, Qwen/Qwen3-8B\n" +
+                    "- 13-15B: codellama/CodeLlama-13b-Instruct-hf, bigcode/starcoder2-15b, Qwen/Qwen2.5-Coder-14B-Instruct, Qwen/Qwen3-14B\n" +
+                    "- 22-27B: mistralai/Codestral-22B-v0.1, google/gemma-2-27b\n" +
+                    "- 30-34B: deepseek-ai/deepseek-coder-33b-instruct, codellama/CodeLlama-34b-Instruct-hf, Qwen/Qwen2.5-Coder-32B-Instruct, Qwen/Qwen3-Coder-30B-A3B, Qwen/Qwen3-32B\n" +
                     "- 70-72B: codellama/CodeLlama-70b-Instruct-hf, meta-llama/Llama-3.1-70B-Instruct, Qwen/Qwen2.5-72B-Instruct\n\n" +
                     "TYPICAL WORKFLOW: estimate_job first to check cost, then create_job, then job_status to monitor progress.",
                 inputSchema: {
@@ -167,7 +183,10 @@ async function startMcpServer() {
             },
             {
                 name: "retry_job",
-                description: "Retry a failed fine-tuning job from its last checkpoint. Creates a new job that resumes training where the failed one stopped, saving GPU time. Each retry is billed separately.",
+                description: "Retry a failed fine-tuning job from its last checkpoint. Creates a new job that resumes training where the failed one stopped, saving GPU time. Each retry is billed separately.\n\n" +
+                    "IMPORTANT: This tool fetches a cost estimate and includes it in the response. " +
+                    "You MUST show the estimate to the user and get their explicit approval before considering the retry confirmed. " +
+                    "The retry is submitted automatically (the server validates balance), but always present the cost to the user.",
                 inputSchema: {
                     type: "object",
                     properties: {
@@ -203,6 +222,10 @@ async function startMcpServer() {
                             type: "number",
                             description: "Approximate repository size in MB (helps refine the estimate)",
                         },
+                        use_case: {
+                            type: "string",
+                            description: "Agent to use for the estimate (e.g. 'code_repo' for Cody, 'sera_code_repo' for SIERA). Defaults to code_repo.",
+                        },
                     },
                 },
             },
@@ -268,10 +291,15 @@ async function startMcpServer() {
             },
             {
                 name: "list_supported_models",
-                description: "List the supported base HuggingFace models available for fine-tuning on Tuning Engines.",
+                description: "List the supported base HuggingFace models available for fine-tuning on Tuning Engines. Optionally filter by agent to see only compatible models.",
                 inputSchema: {
                     type: "object",
-                    properties: {},
+                    properties: {
+                        agent: {
+                            type: "string",
+                            description: "Filter models compatible with this agent (e.g. 'code_repo', 'sera_code_repo'). Omit to see all models.",
+                        },
+                    },
                 },
             },
             {
@@ -330,6 +358,285 @@ async function startMcpServer() {
                     required: ["model_id"],
                 },
             },
+            {
+                name: "list_catalog_models",
+                description: "List available pre-built models and datasets from the Tuning Engines Marketplace. " +
+                    "These are platform-owned, ready-to-use assets that can be exported to your S3 bucket. " +
+                    "Returns name, description, base model, size, export price, and category.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        category: {
+                            type: "string",
+                            description: "Filter by category (e.g. 'code', 'bug-fix', 'general'). Omit to see all.",
+                        },
+                    },
+                },
+            },
+            {
+                name: "get_catalog_model",
+                description: "Get detailed information about a specific pre-built model or dataset from the Marketplace including description, pricing, and export options.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        model_id: { type: "string", description: "Catalog model ID (UUID)" },
+                    },
+                    required: ["model_id"],
+                },
+            },
+            {
+                name: "export_catalog_model",
+                description: "Export a pre-built model or dataset from the Marketplace to your S3 bucket. " +
+                    "Credits will be charged based on the export price upon successful completion.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        model_id: { type: "string", description: "Catalog model ID (UUID) to export" },
+                        s3_bucket: { type: "string", description: "Destination S3 bucket name" },
+                        s3_prefix: {
+                            type: "string",
+                            description: "Optional S3 key prefix for the exported model",
+                        },
+                        s3_access_key_id: { type: "string", description: "AWS access key ID" },
+                        s3_secret_access_key: { type: "string", description: "AWS secret access key" },
+                        s3_region: { type: "string", description: "AWS region (e.g. us-east-1)" },
+                    },
+                    required: ["model_id", "s3_bucket", "s3_access_key_id", "s3_secret_access_key", "s3_region"],
+                },
+            },
+            {
+                name: "catalog_export_status",
+                description: "Check the status of a Marketplace export operation. Returns status, charge info, and any error messages.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        model_id: { type: "string", description: "Catalog model ID (UUID)" },
+                        export_id: { type: "string", description: "Export operation ID (UUID)" },
+                    },
+                    required: ["model_id", "export_id"],
+                },
+            },
+            // --- Datasets ---
+            {
+                name: "list_datasets",
+                description: "List datasets available for training and evaluation. Datasets can be uploaded from S3 and used for fine-tuning or model evaluation.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        limit: { type: "number", description: "Max results (default 20)" },
+                    },
+                },
+            },
+            {
+                name: "show_dataset",
+                description: "Get details of a specific dataset including status, source, and metadata.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        dataset_id: { type: "string", description: "Dataset ID (UUID)" },
+                    },
+                    required: ["dataset_id"],
+                },
+            },
+            {
+                name: "create_dataset",
+                description: "Create a new dataset by importing from S3. Datasets can be used for fine-tuning or model evaluation.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        name: { type: "string", description: "Name for the dataset" },
+                        description: { type: "string", description: "Description of the dataset contents" },
+                        source_type: { type: "string", description: "Source type (e.g. 's3')" },
+                        s3_url: { type: "string", description: "S3 URL of the dataset (e.g. s3://bucket/path/data.jsonl)" },
+                        s3_access_key_id: { type: "string", description: "AWS access key ID" },
+                        s3_secret_access_key: { type: "string", description: "AWS secret access key" },
+                        s3_region: { type: "string", description: "AWS region (e.g. us-east-1)" },
+                        for_evaluation: { type: "boolean", description: "Whether this dataset is for evaluation (default: false)" },
+                    },
+                    required: ["name", "source_type"],
+                },
+            },
+            {
+                name: "delete_dataset",
+                description: "Delete a dataset from the platform.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        dataset_id: { type: "string", description: "Dataset ID to delete" },
+                    },
+                    required: ["dataset_id"],
+                },
+            },
+            {
+                name: "dataset_status",
+                description: "Check the status of a dataset import or processing operation.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        dataset_id: { type: "string", description: "Dataset ID (UUID)" },
+                    },
+                    required: ["dataset_id"],
+                },
+            },
+            // --- Evaluations ---
+            {
+                name: "list_evaluations",
+                description: "List model evaluations. Evaluations run your trained models against benchmark datasets using various evaluators to measure quality.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        status: {
+                            type: "string",
+                            description: "Filter by status: queued, running, succeeded, failed, canceled",
+                        },
+                        limit: { type: "number", description: "Max results (default 20)" },
+                    },
+                },
+            },
+            {
+                name: "show_evaluation",
+                description: "Get full details of a specific evaluation including status, scores, metrics, and comparison data.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        evaluation_id: { type: "string", description: "Evaluation ID (UUID)" },
+                    },
+                    required: ["evaluation_id"],
+                },
+            },
+            {
+                name: "create_evaluation",
+                description: "Create a new model evaluation. Run your trained model or a base model against a dataset using selected evaluators. " +
+                    "Use list_evaluators to see available evaluators (e.g. code_execution, similarity, llm_judge).",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        name: { type: "string", description: "Name for this evaluation run" },
+                        user_model_id: {
+                            type: "string",
+                            description: "ID of your trained model to evaluate. Either this or base_model is required.",
+                        },
+                        base_model: {
+                            type: "string",
+                            description: "HuggingFace model ID to evaluate (e.g. 'Qwen/Qwen2.5-Coder-7B-Instruct'). Either this or user_model_id is required.",
+                        },
+                        dataset_id: {
+                            type: "string",
+                            description: "ID of the evaluation dataset to use. Must be a dataset marked for_evaluation.",
+                        },
+                        evaluator_ids: {
+                            type: "array",
+                            items: { type: "string" },
+                            description: "List of evaluator IDs to run (use list_evaluators to see options)",
+                        },
+                        max_samples: {
+                            type: "number",
+                            description: "Maximum samples to evaluate (default: all)",
+                        },
+                    },
+                    required: ["dataset_id", "evaluator_ids"],
+                },
+            },
+            {
+                name: "cancel_evaluation",
+                description: "Cancel a running or queued evaluation.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        evaluation_id: { type: "string", description: "Evaluation ID to cancel" },
+                    },
+                    required: ["evaluation_id"],
+                },
+            },
+            {
+                name: "evaluation_status",
+                description: "Get live status of an evaluation including progress and current metrics.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        evaluation_id: { type: "string", description: "Evaluation ID (UUID)" },
+                    },
+                    required: ["evaluation_id"],
+                },
+            },
+            {
+                name: "list_evaluators",
+                description: "List available evaluators for model evaluation. Evaluators measure different aspects of model quality like code execution, similarity, or LLM-based judgment.",
+                inputSchema: {
+                    type: "object",
+                    properties: {},
+                },
+            },
+            {
+                name: "estimate_evaluation",
+                description: "Get a cost estimate for an evaluation before running it.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        user_model_id: { type: "string", description: "ID of your trained model" },
+                        base_model: { type: "string", description: "Or a HuggingFace model ID" },
+                        dataset_id: { type: "string", description: "Evaluation dataset ID" },
+                        evaluator_ids: {
+                            type: "array",
+                            items: { type: "string" },
+                            description: "List of evaluator IDs",
+                        },
+                        max_samples: { type: "number", description: "Max samples to evaluate" },
+                    },
+                    required: ["dataset_id", "evaluator_ids"],
+                },
+            },
+            // --- Inference ---
+            {
+                name: "list_inference_models",
+                description: "List models available for inference through the Tuning Engines inference API. " +
+                    "Includes both platform models and your deployed trained models.",
+                inputSchema: {
+                    type: "object",
+                    properties: {},
+                },
+            },
+            {
+                name: "inference_usage",
+                description: "Get inference API usage statistics including request counts, token usage, and costs.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        start_date: { type: "string", description: "Start date (YYYY-MM-DD)" },
+                        end_date: { type: "string", description: "End date (YYYY-MM-DD)" },
+                        model: { type: "string", description: "Filter by model name" },
+                    },
+                },
+            },
+            {
+                name: "get_inference_jwt",
+                description: "Get a JWT token for authenticating with the Tuning Engines inference API. " +
+                    "Use this to make direct API calls to the inference endpoint.",
+                inputSchema: {
+                    type: "object",
+                    properties: {},
+                },
+            },
+            // --- Agents ---
+            {
+                name: "list_agents",
+                description: "List available agents configured for your organization. Agents are AI assistants with specific capabilities and tool access.",
+                inputSchema: {
+                    type: "object",
+                    properties: {},
+                },
+            },
+            {
+                name: "show_agent",
+                description: "Get details of a specific agent including capabilities, tools, and configuration.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        agent_id: { type: "string", description: "Agent ID" },
+                    },
+                    required: ["agent_id"],
+                },
+            },
         ],
     }));
     // Handle tool calls
@@ -339,13 +646,13 @@ async function startMcpServer() {
             let result;
             switch (name) {
                 case "list_jobs":
-                    result = await client.listJobs({
+                    result = await getClient().listJobs({
                         status: args?.status,
                         limit: args?.limit,
                     });
                     break;
                 case "show_job":
-                    result = await client.getJob(args.job_id);
+                    result = await getClient().getJob(args.job_id);
                     break;
                 case "create_job":
                     if (!args?.base_model && !args?.base_user_model_id) {
@@ -354,7 +661,7 @@ async function startMcpServer() {
                             isError: true,
                         };
                     }
-                    result = await client.createJob({
+                    result = await getClient().createJob({
                         base_model: args?.base_model,
                         base_user_model_id: args?.base_user_model_id,
                         output_name: args.output_name,
@@ -371,14 +678,34 @@ async function startMcpServer() {
                     });
                     break;
                 case "cancel_job":
-                    result = await client.cancelJob(args.job_id);
+                    result = await getClient().cancelJob(args.job_id);
                     break;
                 case "job_status":
-                    result = await client.getJobStatus(args.job_id);
+                    result = await getClient().getJobStatus(args.job_id);
                     break;
-                case "retry_job":
-                    result = await client.retryJob(args.job_id, args?.github_token);
+                case "retry_job": {
+                    // Fetch job details and estimate before retrying so the AI can show cost
+                    const retryJobId = args.job_id;
+                    const jobDetails = await getClient().getJob(retryJobId);
+                    let retryEstimate = null;
+                    try {
+                        retryEstimate = await getClient().estimateJob({
+                            base_model: jobDetails.base_model,
+                            num_epochs: jobDetails.num_epochs,
+                            max_examples: jobDetails.max_examples,
+                            use_case: jobDetails.agent,
+                        });
+                    }
+                    catch (estErr) {
+                        // Estimate failed — continue with retry (server validates balance)
+                    }
+                    const retryResult = await getClient().retryJob(retryJobId, args?.github_token);
+                    result = {
+                        ...retryResult,
+                        retry_estimate: retryEstimate,
+                    };
                     break;
+                }
                 case "estimate_job":
                     if (!args?.base_model && !args?.base_user_model_id) {
                         return {
@@ -386,16 +713,17 @@ async function startMcpServer() {
                             isError: true,
                         };
                     }
-                    result = await client.estimateJob({
+                    result = await getClient().estimateJob({
                         base_model: args?.base_model,
                         base_user_model_id: args?.base_user_model_id,
                         num_epochs: args?.num_epochs,
                         max_examples: args?.max_examples,
                         repo_size_mb: args?.repo_size_mb,
+                        use_case: args?.use_case,
                     });
                     break;
                 case "validate_s3":
-                    result = await client.validateS3({
+                    result = await getClient().validateS3({
                         s3_bucket: args.s3_bucket,
                         s3_access_key_id: args.s3_access_key_id,
                         s3_secret_access_key: args.s3_secret_access_key,
@@ -403,25 +731,25 @@ async function startMcpServer() {
                     });
                     break;
                 case "list_models":
-                    result = await client.listUserModels();
+                    result = await getClient().listUserModels();
                     break;
                 case "show_model":
-                    result = await client.getUserModel(args.model_id);
+                    result = await getClient().getUserModel(args.model_id);
                     break;
                 case "delete_model":
-                    result = await client.deleteUserModel(args.model_id);
+                    result = await getClient().deleteUserModel(args.model_id);
                     break;
                 case "get_balance":
-                    result = await client.getBilling();
+                    result = await getClient().getBilling();
                     break;
                 case "get_account":
-                    result = await client.getAccount();
+                    result = await getClient().getAccount();
                     break;
                 case "list_supported_models":
-                    result = await client.listModels();
+                    result = await getClient().listModels({ agent: args?.agent });
                     break;
                 case "import_model":
-                    result = await client.importModel({
+                    result = await getClient().importModel({
                         name: args.name,
                         source_s3_url: args.source_s3_url,
                         base_model: args.base_model,
@@ -431,7 +759,7 @@ async function startMcpServer() {
                     });
                     break;
                 case "export_model":
-                    result = await client.exportModel(args.model_id, {
+                    result = await getClient().exportModel(args.model_id, {
                         s3_bucket: args.s3_bucket,
                         s3_prefix: args?.s3_prefix,
                         s3_access_key_id: args.s3_access_key_id,
@@ -441,7 +769,119 @@ async function startMcpServer() {
                     });
                     break;
                 case "model_status":
-                    result = await client.getUserModelStatus(args.model_id);
+                    result = await getClient().getUserModelStatus(args.model_id);
+                    break;
+                case "list_catalog_models":
+                    result = await getClient().listCatalogModels({
+                        category: args?.category,
+                    });
+                    break;
+                case "get_catalog_model":
+                    result = await getClient().getCatalogModel(args.model_id);
+                    break;
+                case "export_catalog_model":
+                    result = await getClient().exportCatalogModel(args.model_id, {
+                        s3_bucket: args.s3_bucket,
+                        s3_prefix: args?.s3_prefix,
+                        s3_access_key_id: args.s3_access_key_id,
+                        s3_secret_access_key: args.s3_secret_access_key,
+                        s3_region: args.s3_region,
+                    });
+                    break;
+                case "catalog_export_status":
+                    result = await getClient().getCatalogExportStatus(args.model_id, args.export_id);
+                    break;
+                // --- Datasets ---
+                case "list_datasets":
+                    result = await getClient().listDatasets({
+                        limit: args?.limit,
+                    });
+                    break;
+                case "show_dataset":
+                    result = await getClient().getDataset(args.dataset_id);
+                    break;
+                case "create_dataset":
+                    result = await getClient().createDataset({
+                        name: args.name,
+                        description: args?.description,
+                        source_type: args.source_type,
+                        s3_url: args?.s3_url,
+                        s3_access_key_id: args?.s3_access_key_id,
+                        s3_secret_access_key: args?.s3_secret_access_key,
+                        s3_region: args?.s3_region,
+                        for_evaluation: args?.for_evaluation,
+                    });
+                    break;
+                case "delete_dataset":
+                    result = await getClient().deleteDataset(args.dataset_id);
+                    break;
+                case "dataset_status":
+                    result = await getClient().getDatasetStatus(args.dataset_id);
+                    break;
+                // --- Evaluations ---
+                case "list_evaluations":
+                    result = await getClient().listEvaluations({
+                        status: args?.status,
+                        limit: args?.limit,
+                    });
+                    break;
+                case "show_evaluation":
+                    result = await getClient().getEvaluation(args.evaluation_id);
+                    break;
+                case "create_evaluation":
+                    if (!args?.user_model_id && !args?.base_model) {
+                        return {
+                            content: [{ type: "text", text: "Error: either user_model_id or base_model is required" }],
+                            isError: true,
+                        };
+                    }
+                    result = await getClient().createEvaluation({
+                        name: args?.name,
+                        user_model_id: args?.user_model_id,
+                        base_model: args?.base_model,
+                        dataset_id: args.dataset_id,
+                        evaluator_ids: args.evaluator_ids,
+                        max_samples: args?.max_samples,
+                    });
+                    break;
+                case "cancel_evaluation":
+                    result = await getClient().cancelEvaluation(args.evaluation_id);
+                    break;
+                case "evaluation_status":
+                    result = await getClient().getEvaluationStatus(args.evaluation_id);
+                    break;
+                case "list_evaluators":
+                    result = await getClient().listEvaluators();
+                    break;
+                case "estimate_evaluation":
+                    result = await getClient().estimateEvaluation({
+                        user_model_id: args?.user_model_id,
+                        base_model: args?.base_model,
+                        dataset_id: args.dataset_id,
+                        evaluator_ids: args.evaluator_ids,
+                        max_samples: args?.max_samples,
+                    });
+                    break;
+                // --- Inference ---
+                case "list_inference_models":
+                    result = await getClient().listInferenceModels();
+                    break;
+                case "inference_usage":
+                    result = await getClient().getInferenceUsage({
+                        start_date: args?.start_date,
+                        end_date: args?.end_date,
+                        model: args?.model,
+                    });
+                    break;
+                case "get_inference_jwt":
+                    result = await getClient().getInferenceJwt();
+                    break;
+                // --- Agents ---
+                case "list_agents":
+                    result = await getClient().listAgents();
+                    break;
+                case "show_agent":
+                    result = await getClient().getAgent(args.agent_id);
                     break;
                 default:
                     return {