tuningengines-cli 0.3.5 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mcp.js CHANGED
@@ -7,13 +7,22 @@ const types_js_1 = require("@modelcontextprotocol/sdk/types.js");
7
7
  const client_1 = require("./client");
8
8
  const config_1 = require("./config");
9
9
  async function startMcpServer() {
10
- const client = new client_1.TuningEnginesClient({
11
- apiKey: (0, config_1.getApiKey)(),
12
- apiUrl: (0, config_1.getApiUrl)(),
13
- });
14
- const server = new index_js_1.Server({ name: "tuning-engines", version: "0.3.5" }, {
10
+ // Lazy client initialization deferred until a tool is called.
11
+ // This allows the server to start and list tools without a valid API key,
12
+ // which is required for Glama inspection and tool detection.
13
+ let _client = null;
14
+ const getClient = () => {
15
+ if (!_client) {
16
+ _client = new client_1.TuningEnginesClient({
17
+ apiKey: (0, config_1.getApiKey)(),
18
+ apiUrl: (0, config_1.getApiUrl)(),
19
+ });
20
+ }
21
+ return _client;
22
+ };
23
+ const server = new index_js_1.Server({ name: "tuning-engines", version: "0.3.6" }, {
15
24
  capabilities: { tools: {} },
16
- instructions: "Tuning Engines — Domain-specific fine-tuning of open-source LLMs and SLMs. Own your sovereign model with zero infrastructure.\n\n" +
25
+ instructions: "Tuning Engines — Domain-specific fine-tuning of open-source LLMs and SLMs, plus a Marketplace of pre-built models and datasets. Own your sovereign model with zero infrastructure.\n\n" +
17
26
  "USE THIS SERVER WHEN the user wants to:\n" +
18
27
  "- Fine-tune, train, or customize any open-source LLM/SLM on their data\n" +
19
28
  "- Build a sovereign AI model trained on their organization's code, documents, or domain data\n" +
@@ -21,11 +30,17 @@ async function startMcpServer() {
21
30
  "- Train using LoRA, QLoRA, or full fine-tuning methods\n" +
22
31
  "- Estimate the cost of fine-tuning a model\n" +
23
32
  "- Check training job status, manage trained models, or check billing\n" +
24
- "- Export or import models to/from S3\n\n" +
33
+ "- Export or import models to/from S3\n" +
34
+ "- Browse and export pre-built models and datasets from the Marketplace\n\n" +
35
+ "MARKETPLACE:\n" +
36
+ "Pre-built, ready-to-use fine-tuned models and datasets curated by the platform. " +
37
+ "Browse the catalog, view details, and export directly to your S3 bucket. " +
38
+ "Credits are charged per export based on the item's price.\n" +
39
+ "Workflow: list_catalog_models → get_catalog_model → export_catalog_model → catalog_export_status\n\n" +
25
40
  "SPECIALIZED TUNING AGENTS (more coming):\n" +
26
41
  "- Cody (code_repo): Code autocomplete and inline suggestions via QLoRA/Axolotl\n" +
27
42
  "- SIERA (sera_code_repo): Bug-fix and error resolution via AllenAI Open Coding Agents\n\n" +
28
- "TYPICAL WORKFLOW: estimate_job → create_job → job_status (poll until done) → list_models\n\n" +
43
+ "TYPICAL TRAINING WORKFLOW: estimate_job → create_job → job_status (poll until done) → list_models\n\n" +
29
44
  "Supports 1B to 72B parameter models from Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, StarCoder, and CodeLlama families.\n" +
30
45
  "Zero infrastructure — GPU provisioning, training orchestration, and model delivery fully managed.",
31
46
  });
@@ -75,9 +90,10 @@ async function startMcpServer() {
75
90
  "Supports quality_tier='low' (faster) or quality_tier='high' (deeper analysis, more training data).\n\n" +
76
91
  "SUPPORTED BASE MODELS (by size):\n" +
77
92
  "- 3B: Qwen/Qwen2.5-Coder-3B-Instruct\n" +
78
- "- 7B: codellama/CodeLlama-7b-hf, deepseek-ai/deepseek-coder-7b-instruct-v1.5, Qwen/Qwen2.5-Coder-7B-Instruct\n" +
79
- "- 13-15B: codellama/CodeLlama-13b-Instruct-hf, bigcode/starcoder2-15b, Qwen/Qwen2.5-Coder-14B-Instruct\n" +
80
- "- 32-34B: deepseek-ai/deepseek-coder-33b-instruct, codellama/CodeLlama-34b-Instruct-hf, Qwen/Qwen2.5-Coder-32B-Instruct\n" +
93
+ "- 7-8B: codellama/CodeLlama-7b-hf, deepseek-ai/deepseek-coder-7b-instruct-v1.5, Qwen/Qwen2.5-Coder-7B-Instruct, Qwen/Qwen3-8B\n" +
94
+ "- 13-15B: codellama/CodeLlama-13b-Instruct-hf, bigcode/starcoder2-15b, Qwen/Qwen2.5-Coder-14B-Instruct, Qwen/Qwen3-14B\n" +
95
+ "- 22-27B: mistralai/Codestral-22B-v0.1, google/gemma-2-27b\n" +
96
+ "- 30-34B: deepseek-ai/deepseek-coder-33b-instruct, codellama/CodeLlama-34b-Instruct-hf, Qwen/Qwen2.5-Coder-32B-Instruct, Qwen/Qwen3-Coder-30B-A3B, Qwen/Qwen3-32B\n" +
81
97
  "- 70-72B: codellama/CodeLlama-70b-Instruct-hf, meta-llama/Llama-3.1-70B-Instruct, Qwen/Qwen2.5-72B-Instruct\n\n" +
82
98
  "TYPICAL WORKFLOW: estimate_job first to check cost, then create_job, then job_status to monitor progress.",
83
99
  inputSchema: {
@@ -167,7 +183,10 @@ async function startMcpServer() {
167
183
  },
168
184
  {
169
185
  name: "retry_job",
170
- description: "Retry a failed fine-tuning job from its last checkpoint. Creates a new job that resumes training where the failed one stopped, saving GPU time. Each retry is billed separately.",
186
+ description: "Retry a failed fine-tuning job from its last checkpoint. Creates a new job that resumes training where the failed one stopped, saving GPU time. Each retry is billed separately.\n\n" +
187
+ "IMPORTANT: This tool fetches a cost estimate and includes it in the response. " +
188
+ "You MUST show the estimate to the user and get their explicit approval before considering the retry confirmed. " +
189
+ "The retry is submitted automatically (the server validates balance), but always present the cost to the user.",
171
190
  inputSchema: {
172
191
  type: "object",
173
192
  properties: {
@@ -203,6 +222,10 @@ async function startMcpServer() {
203
222
  type: "number",
204
223
  description: "Approximate repository size in MB (helps refine the estimate)",
205
224
  },
225
+ use_case: {
226
+ type: "string",
227
+ description: "Agent to use for the estimate (e.g. 'code_repo' for Cody, 'sera_code_repo' for SIERA). Defaults to code_repo.",
228
+ },
206
229
  },
207
230
  },
208
231
  },
@@ -268,10 +291,15 @@ async function startMcpServer() {
268
291
  },
269
292
  {
270
293
  name: "list_supported_models",
271
- description: "List the supported base HuggingFace models available for fine-tuning on Tuning Engines.",
294
+ description: "List the supported base HuggingFace models available for fine-tuning on Tuning Engines. Optionally filter by agent to see only compatible models.",
272
295
  inputSchema: {
273
296
  type: "object",
274
- properties: {},
297
+ properties: {
298
+ agent: {
299
+ type: "string",
300
+ description: "Filter models compatible with this agent (e.g. 'code_repo', 'sera_code_repo'). Omit to see all models.",
301
+ },
302
+ },
275
303
  },
276
304
  },
277
305
  {
@@ -330,6 +358,285 @@ async function startMcpServer() {
330
358
  required: ["model_id"],
331
359
  },
332
360
  },
361
+ {
362
+ name: "list_catalog_models",
363
+ description: "List available pre-built models and datasets from the Tuning Engines Marketplace. " +
364
+ "These are platform-owned, ready-to-use assets that can be exported to your S3 bucket. " +
365
+ "Returns name, description, base model, size, export price, and category.",
366
+ inputSchema: {
367
+ type: "object",
368
+ properties: {
369
+ category: {
370
+ type: "string",
371
+ description: "Filter by category (e.g. 'code', 'bug-fix', 'general'). Omit to see all.",
372
+ },
373
+ },
374
+ },
375
+ },
376
+ {
377
+ name: "get_catalog_model",
378
+ description: "Get detailed information about a specific pre-built model or dataset from the Marketplace including description, pricing, and export options.",
379
+ inputSchema: {
380
+ type: "object",
381
+ properties: {
382
+ model_id: { type: "string", description: "Catalog model ID (UUID)" },
383
+ },
384
+ required: ["model_id"],
385
+ },
386
+ },
387
+ {
388
+ name: "export_catalog_model",
389
+ description: "Export a pre-built model or dataset from the Marketplace to your S3 bucket. " +
390
+ "Credits will be charged based on the export price upon successful completion.",
391
+ inputSchema: {
392
+ type: "object",
393
+ properties: {
394
+ model_id: { type: "string", description: "Catalog model ID (UUID) to export" },
395
+ s3_bucket: { type: "string", description: "Destination S3 bucket name" },
396
+ s3_prefix: {
397
+ type: "string",
398
+ description: "Optional S3 key prefix for the exported model",
399
+ },
400
+ s3_access_key_id: { type: "string", description: "AWS access key ID" },
401
+ s3_secret_access_key: { type: "string", description: "AWS secret access key" },
402
+ s3_region: { type: "string", description: "AWS region (e.g. us-east-1)" },
403
+ },
404
+ required: ["model_id", "s3_bucket", "s3_access_key_id", "s3_secret_access_key", "s3_region"],
405
+ },
406
+ },
407
+ {
408
+ name: "catalog_export_status",
409
+ description: "Check the status of a Marketplace export operation. Returns status, charge info, and any error messages.",
410
+ inputSchema: {
411
+ type: "object",
412
+ properties: {
413
+ model_id: { type: "string", description: "Catalog model ID (UUID)" },
414
+ export_id: { type: "string", description: "Export operation ID (UUID)" },
415
+ },
416
+ required: ["model_id", "export_id"],
417
+ },
418
+ },
419
+ // --- Datasets ---
420
+ {
421
+ name: "list_datasets",
422
+ description: "List datasets available for training and evaluation. Datasets can be uploaded from S3 and used for fine-tuning or model evaluation.",
423
+ inputSchema: {
424
+ type: "object",
425
+ properties: {
426
+ limit: { type: "number", description: "Max results (default 20)" },
427
+ },
428
+ },
429
+ },
430
+ {
431
+ name: "show_dataset",
432
+ description: "Get details of a specific dataset including status, source, and metadata.",
433
+ inputSchema: {
434
+ type: "object",
435
+ properties: {
436
+ dataset_id: { type: "string", description: "Dataset ID (UUID)" },
437
+ },
438
+ required: ["dataset_id"],
439
+ },
440
+ },
441
+ {
442
+ name: "create_dataset",
443
+ description: "Create a new dataset by importing from S3. Datasets can be used for fine-tuning or model evaluation.",
444
+ inputSchema: {
445
+ type: "object",
446
+ properties: {
447
+ name: { type: "string", description: "Name for the dataset" },
448
+ description: { type: "string", description: "Description of the dataset contents" },
449
+ source_type: { type: "string", description: "Source type (e.g. 's3')" },
450
+ s3_url: { type: "string", description: "S3 URL of the dataset (e.g. s3://bucket/path/data.jsonl)" },
451
+ s3_access_key_id: { type: "string", description: "AWS access key ID" },
452
+ s3_secret_access_key: { type: "string", description: "AWS secret access key" },
453
+ s3_region: { type: "string", description: "AWS region (e.g. us-east-1)" },
454
+ for_evaluation: { type: "boolean", description: "Whether this dataset is for evaluation (default: false)" },
455
+ },
456
+ required: ["name", "source_type"],
457
+ },
458
+ },
459
+ {
460
+ name: "delete_dataset",
461
+ description: "Delete a dataset from the platform.",
462
+ inputSchema: {
463
+ type: "object",
464
+ properties: {
465
+ dataset_id: { type: "string", description: "Dataset ID to delete" },
466
+ },
467
+ required: ["dataset_id"],
468
+ },
469
+ },
470
+ {
471
+ name: "dataset_status",
472
+ description: "Check the status of a dataset import or processing operation.",
473
+ inputSchema: {
474
+ type: "object",
475
+ properties: {
476
+ dataset_id: { type: "string", description: "Dataset ID (UUID)" },
477
+ },
478
+ required: ["dataset_id"],
479
+ },
480
+ },
481
+ // --- Evaluations ---
482
+ {
483
+ name: "list_evaluations",
484
+ description: "List model evaluations. Evaluations run your trained models against benchmark datasets using various evaluators to measure quality.",
485
+ inputSchema: {
486
+ type: "object",
487
+ properties: {
488
+ status: {
489
+ type: "string",
490
+ description: "Filter by status: queued, running, succeeded, failed, canceled",
491
+ },
492
+ limit: { type: "number", description: "Max results (default 20)" },
493
+ },
494
+ },
495
+ },
496
+ {
497
+ name: "show_evaluation",
498
+ description: "Get full details of a specific evaluation including status, scores, metrics, and comparison data.",
499
+ inputSchema: {
500
+ type: "object",
501
+ properties: {
502
+ evaluation_id: { type: "string", description: "Evaluation ID (UUID)" },
503
+ },
504
+ required: ["evaluation_id"],
505
+ },
506
+ },
507
+ {
508
+ name: "create_evaluation",
509
+ description: "Create a new model evaluation. Run your trained model or a base model against a dataset using selected evaluators. " +
510
+ "Use list_evaluators to see available evaluators (e.g. code_execution, similarity, llm_judge).",
511
+ inputSchema: {
512
+ type: "object",
513
+ properties: {
514
+ name: { type: "string", description: "Name for this evaluation run" },
515
+ user_model_id: {
516
+ type: "string",
517
+ description: "ID of your trained model to evaluate. Either this or base_model is required.",
518
+ },
519
+ base_model: {
520
+ type: "string",
521
+ description: "HuggingFace model ID to evaluate (e.g. 'Qwen/Qwen2.5-Coder-7B-Instruct'). Either this or user_model_id is required.",
522
+ },
523
+ dataset_id: {
524
+ type: "string",
525
+ description: "ID of the evaluation dataset to use. Must be a dataset marked for_evaluation.",
526
+ },
527
+ evaluator_ids: {
528
+ type: "array",
529
+ items: { type: "string" },
530
+ description: "List of evaluator IDs to run (use list_evaluators to see options)",
531
+ },
532
+ max_samples: {
533
+ type: "number",
534
+ description: "Maximum samples to evaluate (default: all)",
535
+ },
536
+ },
537
+ required: ["dataset_id", "evaluator_ids"],
538
+ },
539
+ },
540
+ {
541
+ name: "cancel_evaluation",
542
+ description: "Cancel a running or queued evaluation.",
543
+ inputSchema: {
544
+ type: "object",
545
+ properties: {
546
+ evaluation_id: { type: "string", description: "Evaluation ID to cancel" },
547
+ },
548
+ required: ["evaluation_id"],
549
+ },
550
+ },
551
+ {
552
+ name: "evaluation_status",
553
+ description: "Get live status of an evaluation including progress and current metrics.",
554
+ inputSchema: {
555
+ type: "object",
556
+ properties: {
557
+ evaluation_id: { type: "string", description: "Evaluation ID (UUID)" },
558
+ },
559
+ required: ["evaluation_id"],
560
+ },
561
+ },
562
+ {
563
+ name: "list_evaluators",
564
+ description: "List available evaluators for model evaluation. Evaluators measure different aspects of model quality like code execution, similarity, or LLM-based judgment.",
565
+ inputSchema: {
566
+ type: "object",
567
+ properties: {},
568
+ },
569
+ },
570
+ {
571
+ name: "estimate_evaluation",
572
+ description: "Get a cost estimate for an evaluation before running it.",
573
+ inputSchema: {
574
+ type: "object",
575
+ properties: {
576
+ user_model_id: { type: "string", description: "ID of your trained model" },
577
+ base_model: { type: "string", description: "Or a HuggingFace model ID" },
578
+ dataset_id: { type: "string", description: "Evaluation dataset ID" },
579
+ evaluator_ids: {
580
+ type: "array",
581
+ items: { type: "string" },
582
+ description: "List of evaluator IDs",
583
+ },
584
+ max_samples: { type: "number", description: "Max samples to evaluate" },
585
+ },
586
+ required: ["dataset_id", "evaluator_ids"],
587
+ },
588
+ },
589
+ // --- Inference ---
590
+ {
591
+ name: "list_inference_models",
592
+ description: "List models available for inference through the Tuning Engines inference API. " +
593
+ "Includes both platform models and your deployed trained models.",
594
+ inputSchema: {
595
+ type: "object",
596
+ properties: {},
597
+ },
598
+ },
599
+ {
600
+ name: "inference_usage",
601
+ description: "Get inference API usage statistics including request counts, token usage, and costs.",
602
+ inputSchema: {
603
+ type: "object",
604
+ properties: {
605
+ start_date: { type: "string", description: "Start date (YYYY-MM-DD)" },
606
+ end_date: { type: "string", description: "End date (YYYY-MM-DD)" },
607
+ model: { type: "string", description: "Filter by model name" },
608
+ },
609
+ },
610
+ },
611
+ {
612
+ name: "get_inference_jwt",
613
+ description: "Get a JWT token for authenticating with the Tuning Engines inference API. " +
614
+ "Use this to make direct API calls to the inference endpoint.",
615
+ inputSchema: {
616
+ type: "object",
617
+ properties: {},
618
+ },
619
+ },
620
+ // --- Agents ---
621
+ {
622
+ name: "list_agents",
623
+ description: "List available agents configured for your organization. Agents are AI assistants with specific capabilities and tool access.",
624
+ inputSchema: {
625
+ type: "object",
626
+ properties: {},
627
+ },
628
+ },
629
+ {
630
+ name: "show_agent",
631
+ description: "Get details of a specific agent including capabilities, tools, and configuration.",
632
+ inputSchema: {
633
+ type: "object",
634
+ properties: {
635
+ agent_id: { type: "string", description: "Agent ID" },
636
+ },
637
+ required: ["agent_id"],
638
+ },
639
+ },
333
640
  ],
334
641
  }));
335
642
  // Handle tool calls
@@ -339,13 +646,13 @@ async function startMcpServer() {
339
646
  let result;
340
647
  switch (name) {
341
648
  case "list_jobs":
342
- result = await client.listJobs({
649
+ result = await getClient().listJobs({
343
650
  status: args?.status,
344
651
  limit: args?.limit,
345
652
  });
346
653
  break;
347
654
  case "show_job":
348
- result = await client.getJob(args.job_id);
655
+ result = await getClient().getJob(args.job_id);
349
656
  break;
350
657
  case "create_job":
351
658
  if (!args?.base_model && !args?.base_user_model_id) {
@@ -354,7 +661,7 @@ async function startMcpServer() {
354
661
  isError: true,
355
662
  };
356
663
  }
357
- result = await client.createJob({
664
+ result = await getClient().createJob({
358
665
  base_model: args?.base_model,
359
666
  base_user_model_id: args?.base_user_model_id,
360
667
  output_name: args.output_name,
@@ -371,14 +678,34 @@ async function startMcpServer() {
371
678
  });
372
679
  break;
373
680
  case "cancel_job":
374
- result = await client.cancelJob(args.job_id);
681
+ result = await getClient().cancelJob(args.job_id);
375
682
  break;
376
683
  case "job_status":
377
- result = await client.getJobStatus(args.job_id);
684
+ result = await getClient().getJobStatus(args.job_id);
378
685
  break;
379
- case "retry_job":
380
- result = await client.retryJob(args.job_id, args?.github_token);
686
+ case "retry_job": {
687
+ // Fetch job details and estimate before retrying so the AI can show cost
688
+ const retryJobId = args.job_id;
689
+ const jobDetails = await getClient().getJob(retryJobId);
690
+ let retryEstimate = null;
691
+ try {
692
+ retryEstimate = await getClient().estimateJob({
693
+ base_model: jobDetails.base_model,
694
+ num_epochs: jobDetails.num_epochs,
695
+ max_examples: jobDetails.max_examples,
696
+ use_case: jobDetails.agent,
697
+ });
698
+ }
699
+ catch (estErr) {
700
+ // Estimate failed — continue with retry (server validates balance)
701
+ }
702
+ const retryResult = await getClient().retryJob(retryJobId, args?.github_token);
703
+ result = {
704
+ ...retryResult,
705
+ retry_estimate: retryEstimate,
706
+ };
381
707
  break;
708
+ }
382
709
  case "estimate_job":
383
710
  if (!args?.base_model && !args?.base_user_model_id) {
384
711
  return {
@@ -386,16 +713,17 @@ async function startMcpServer() {
386
713
  isError: true,
387
714
  };
388
715
  }
389
- result = await client.estimateJob({
716
+ result = await getClient().estimateJob({
390
717
  base_model: args?.base_model,
391
718
  base_user_model_id: args?.base_user_model_id,
392
719
  num_epochs: args?.num_epochs,
393
720
  max_examples: args?.max_examples,
394
721
  repo_size_mb: args?.repo_size_mb,
722
+ use_case: args?.use_case,
395
723
  });
396
724
  break;
397
725
  case "validate_s3":
398
- result = await client.validateS3({
726
+ result = await getClient().validateS3({
399
727
  s3_bucket: args.s3_bucket,
400
728
  s3_access_key_id: args.s3_access_key_id,
401
729
  s3_secret_access_key: args.s3_secret_access_key,
@@ -403,25 +731,25 @@ async function startMcpServer() {
403
731
  });
404
732
  break;
405
733
  case "list_models":
406
- result = await client.listUserModels();
734
+ result = await getClient().listUserModels();
407
735
  break;
408
736
  case "show_model":
409
- result = await client.getUserModel(args.model_id);
737
+ result = await getClient().getUserModel(args.model_id);
410
738
  break;
411
739
  case "delete_model":
412
- result = await client.deleteUserModel(args.model_id);
740
+ result = await getClient().deleteUserModel(args.model_id);
413
741
  break;
414
742
  case "get_balance":
415
- result = await client.getBilling();
743
+ result = await getClient().getBilling();
416
744
  break;
417
745
  case "get_account":
418
- result = await client.getAccount();
746
+ result = await getClient().getAccount();
419
747
  break;
420
748
  case "list_supported_models":
421
- result = await client.listModels();
749
+ result = await getClient().listModels({ agent: args?.agent });
422
750
  break;
423
751
  case "import_model":
424
- result = await client.importModel({
752
+ result = await getClient().importModel({
425
753
  name: args.name,
426
754
  source_s3_url: args.source_s3_url,
427
755
  base_model: args.base_model,
@@ -431,7 +759,7 @@ async function startMcpServer() {
431
759
  });
432
760
  break;
433
761
  case "export_model":
434
- result = await client.exportModel(args.model_id, {
762
+ result = await getClient().exportModel(args.model_id, {
435
763
  s3_bucket: args.s3_bucket,
436
764
  s3_prefix: args?.s3_prefix,
437
765
  s3_access_key_id: args.s3_access_key_id,
@@ -441,7 +769,119 @@ async function startMcpServer() {
441
769
  });
442
770
  break;
443
771
  case "model_status":
444
- result = await client.getUserModelStatus(args.model_id);
772
+ result = await getClient().getUserModelStatus(args.model_id);
773
+ break;
774
+ case "list_catalog_models":
775
+ result = await getClient().listCatalogModels({
776
+ category: args?.category,
777
+ });
778
+ break;
779
+ case "get_catalog_model":
780
+ result = await getClient().getCatalogModel(args.model_id);
781
+ break;
782
+ case "export_catalog_model":
783
+ result = await getClient().exportCatalogModel(args.model_id, {
784
+ s3_bucket: args.s3_bucket,
785
+ s3_prefix: args?.s3_prefix,
786
+ s3_access_key_id: args.s3_access_key_id,
787
+ s3_secret_access_key: args.s3_secret_access_key,
788
+ s3_region: args.s3_region,
789
+ });
790
+ break;
791
+ case "catalog_export_status":
792
+ result = await getClient().getCatalogExportStatus(args.model_id, args.export_id);
793
+ break;
794
+ // --- Datasets ---
795
+ case "list_datasets":
796
+ result = await getClient().listDatasets({
797
+ limit: args?.limit,
798
+ });
799
+ break;
800
+ case "show_dataset":
801
+ result = await getClient().getDataset(args.dataset_id);
802
+ break;
803
+ case "create_dataset":
804
+ result = await getClient().createDataset({
805
+ name: args.name,
806
+ description: args?.description,
807
+ source_type: args.source_type,
808
+ s3_url: args?.s3_url,
809
+ s3_access_key_id: args?.s3_access_key_id,
810
+ s3_secret_access_key: args?.s3_secret_access_key,
811
+ s3_region: args?.s3_region,
812
+ for_evaluation: args?.for_evaluation,
813
+ });
814
+ break;
815
+ case "delete_dataset":
816
+ result = await getClient().deleteDataset(args.dataset_id);
817
+ break;
818
+ case "dataset_status":
819
+ result = await getClient().getDatasetStatus(args.dataset_id);
820
+ break;
821
+ // --- Evaluations ---
822
+ case "list_evaluations":
823
+ result = await getClient().listEvaluations({
824
+ status: args?.status,
825
+ limit: args?.limit,
826
+ });
827
+ break;
828
+ case "show_evaluation":
829
+ result = await getClient().getEvaluation(args.evaluation_id);
830
+ break;
831
+ case "create_evaluation":
832
+ if (!args?.user_model_id && !args?.base_model) {
833
+ return {
834
+ content: [{ type: "text", text: "Error: either user_model_id or base_model is required" }],
835
+ isError: true,
836
+ };
837
+ }
838
+ result = await getClient().createEvaluation({
839
+ name: args?.name,
840
+ user_model_id: args?.user_model_id,
841
+ base_model: args?.base_model,
842
+ dataset_id: args.dataset_id,
843
+ evaluator_ids: args.evaluator_ids,
844
+ max_samples: args?.max_samples,
845
+ });
846
+ break;
847
+ case "cancel_evaluation":
848
+ result = await getClient().cancelEvaluation(args.evaluation_id);
849
+ break;
850
+ case "evaluation_status":
851
+ result = await getClient().getEvaluationStatus(args.evaluation_id);
852
+ break;
853
+ case "list_evaluators":
854
+ result = await getClient().listEvaluators();
855
+ break;
856
+ case "estimate_evaluation":
857
+ result = await getClient().estimateEvaluation({
858
+ user_model_id: args?.user_model_id,
859
+ base_model: args?.base_model,
860
+ dataset_id: args.dataset_id,
861
+ evaluator_ids: args.evaluator_ids,
862
+ max_samples: args?.max_samples,
863
+ });
864
+ break;
865
+ // --- Inference ---
866
+ case "list_inference_models":
867
+ result = await getClient().listInferenceModels();
868
+ break;
869
+ case "inference_usage":
870
+ result = await getClient().getInferenceUsage({
871
+ start_date: args?.start_date,
872
+ end_date: args?.end_date,
873
+ model: args?.model,
874
+ });
875
+ break;
876
+ case "get_inference_jwt":
877
+ result = await getClient().getInferenceJwt();
878
+ break;
879
+ // --- Agents ---
880
+ case "list_agents":
881
+ result = await getClient().listAgents();
882
+ break;
883
+ case "show_agent":
884
+ result = await getClient().getAgent(args.agent_id);
445
885
  break;
446
886
  default:
447
887
  return {