ray-mcp-server 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ray_mcp/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Ray MCP Server - Model Context Protocol server for Ray distributed computing."""
2
+
3
# Package metadata.
# The version must track the published distribution (ray-mcp-server 0.2.0);
# it previously still reported the stale "0.1.0".
__version__ = "0.2.0"
__author__ = "claude 4 sonnet"
__email__ = "ray-mcp@example.com"
ray_mcp/main.py ADDED
@@ -0,0 +1,428 @@
1
+ #!/usr/bin/env python3
2
+ """Main entry point for the Ray MCP server."""
3
+
4
+ import asyncio
5
+ import json
6
+ import logging
7
+ import os
8
+ import sys
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ # Import MCP types
12
+ from mcp.server import Server
13
+ from mcp.server.stdio import stdio_server
14
+ from mcp.types import Content, EmbeddedResource, ImageContent, TextContent, Tool
15
+
16
+ # Import Ray modules with proper error handling
17
+ try:
18
+ import ray
19
+ from ray import job_submission
20
+
21
+ RAY_AVAILABLE = True
22
+ except ImportError:
23
+ RAY_AVAILABLE = False
24
+ ray = None
25
+ job_submission = None
26
+
27
+ from .ray_manager import RayManager
28
+ from .types import (
29
+ ActorConfig,
30
+ ActorId,
31
+ ActorInfo,
32
+ ActorState,
33
+ ClusterHealth,
34
+ ErrorResponse,
35
+ HealthStatus,
36
+ JobId,
37
+ JobInfo,
38
+ JobStatus,
39
+ JobSubmissionConfig,
40
+ NodeId,
41
+ NodeInfo,
42
+ PerformanceMetrics,
43
+ Response,
44
+ SuccessResponse,
45
+ )
46
+
47
# Module-level singletons created at import time:
#   * `server` is the MCP server instance, registered under the name "ray-mcp";
#     the list_tools/call_tool handlers below attach to it via its decorators.
#   * `ray_manager` is the object the tool handlers and main() delegate to.
server = Server("ray-mcp")
ray_manager = RayManager()

# Configure root logging at INFO level and create this module's logger.
# NOTE(review): this runs after RayManager() is constructed, so anything logged
# during that construction happens before logging is configured — confirm
# that ordering is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
54
+
55
+
56
def _no_arg_tool(name: str, description: str) -> Tool:
    """Build a Tool whose input schema is an object with no properties."""
    return Tool(
        name=name,
        description=description,
        inputSchema={"type": "object", "properties": {}},
    )


def _job_id_tool(name: str, description: str) -> Tool:
    """Build a Tool whose only (and required) input is a string ``job_id``."""
    return Tool(
        name=name,
        description=description,
        inputSchema={
            "type": "object",
            "properties": {"job_id": {"type": "string"}},
            "required": ["job_id"],
        },
    )


@server.list_tools()
async def list_tools() -> List[Tool]:
    """Return the catalogue of Ray tools exposed by this MCP server.

    The tools are grouped as: basic cluster management, job management,
    actor management, enhanced monitoring, and logs & debugging. The order
    of the returned list follows that grouping.
    """
    # Schema for one entry of the "worker_nodes" array accepted by start_ray.
    worker_node_schema = {
        "type": "object",
        "properties": {
            "num_cpus": {
                "type": "integer",
                "minimum": 1,
                "description": "Number of CPUs for this worker node",
            },
            "num_gpus": {
                "type": "integer",
                "minimum": 0,
                "description": "Number of GPUs for this worker node",
            },
            "object_store_memory": {
                "type": "integer",
                "minimum": 0,
                "description": "Object store memory in bytes for this worker node",
            },
            "resources": {
                "type": "object",
                "description": "Additional custom resources for this worker node",
            },
            "node_name": {
                "type": "string",
                "description": "Optional name for this worker node",
            },
        },
        "required": ["num_cpus"],
    }

    start_ray_schema = {
        "type": "object",
        "properties": {
            "num_cpus": {
                "type": "integer",
                "minimum": 1,
                "default": 1,
                "description": "Number of CPUs for head node",
            },
            "num_gpus": {
                "type": "integer",
                "minimum": 0,
                "description": "Number of GPUs for head node",
            },
            "object_store_memory": {
                "type": "integer",
                "minimum": 0,
                "description": "Object store memory in bytes for head node",
            },
            "worker_nodes": {
                "type": "array",
                "description": "Configuration for worker nodes to start",
                "items": worker_node_schema,
            },
            "head_node_port": {
                "type": "integer",
                "minimum": 10000,
                "maximum": 65535,
                "default": 10001,
                "description": "Port for head node",
            },
            "dashboard_port": {
                "type": "integer",
                "minimum": 1000,
                "maximum": 65535,
                "default": 8265,
                "description": "Port for Ray dashboard",
            },
            "head_node_host": {
                "type": "string",
                "default": "127.0.0.1",
                "description": "Host address for head node",
            },
        },
    }

    connect_ray_schema = {
        "type": "object",
        "properties": {
            "address": {
                "type": "string",
                "description": "Ray cluster address (e.g., 'ray://127.0.0.1:10001' or '127.0.0.1:10001')",
            }
        },
        "required": ["address"],
    }

    # Basic cluster management
    cluster_tools = [
        Tool(
            name="start_ray",
            description="Start a new Ray cluster with head node and worker nodes (defaults to multi-node with 2 workers)",
            inputSchema=start_ray_schema,
        ),
        Tool(
            name="connect_ray",
            description="Connect to an existing Ray cluster",
            inputSchema=connect_ray_schema,
        ),
        _no_arg_tool("stop_ray", "Stop the Ray cluster"),
        _no_arg_tool("cluster_status", "Get Ray cluster status"),
        _no_arg_tool("cluster_resources", "Get cluster resource information"),
        _no_arg_tool("cluster_nodes", "Get cluster node information"),
        _no_arg_tool("worker_status", "Get detailed status of worker nodes"),
    ]

    # Job management
    job_tools = [
        Tool(
            name="submit_job",
            description="Submit a job to the Ray cluster",
            inputSchema={
                "type": "object",
                "properties": {
                    "entrypoint": {"type": "string"},
                    "runtime_env": {"type": "object"},
                    "job_id": {"type": "string"},
                    "metadata": {"type": "object"},
                },
                "required": ["entrypoint"],
            },
        ),
        _no_arg_tool("list_jobs", "List all jobs in the cluster"),
        _job_id_tool("job_status", "Get the status of a specific job"),
        _job_id_tool("cancel_job", "Cancel a running job"),
        _job_id_tool("monitor_job", "Monitor job progress"),
        _job_id_tool("debug_job", "Debug a job with detailed information"),
    ]

    # Actor management
    actor_tools = [
        Tool(
            name="list_actors",
            description="List all actors in the cluster",
            inputSchema={
                "type": "object",
                "properties": {"filters": {"type": "object"}},
            },
        ),
        Tool(
            name="kill_actor",
            description="Kill an actor",
            inputSchema={
                "type": "object",
                "properties": {
                    "actor_id": {"type": "string"},
                    "no_restart": {"type": "boolean", "default": False},
                },
                "required": ["actor_id"],
            },
        ),
    ]

    # Enhanced monitoring
    monitoring_tools = [
        _no_arg_tool("performance_metrics", "Get detailed cluster performance metrics"),
        _no_arg_tool("health_check", "Perform comprehensive cluster health check"),
        _no_arg_tool("optimize_config", "Get cluster optimization recommendations"),
        Tool(
            name="schedule_job",
            description="Schedule a job to run periodically",
            inputSchema={
                "type": "object",
                "properties": {
                    "entrypoint": {"type": "string"},
                    "schedule": {"type": "string"},
                },
                "required": ["entrypoint", "schedule"],
            },
        ),
    ]

    # Logs & debugging
    log_tools = [
        Tool(
            name="get_logs",
            description="Get logs from jobs, actors, or nodes",
            inputSchema={
                "type": "object",
                "properties": {
                    "job_id": {"type": "string"},
                    "actor_id": {"type": "string"},
                    "node_id": {"type": "string"},
                    "num_lines": {"type": "integer", "minimum": 1, "default": 100},
                },
            },
        ),
    ]

    return cluster_tools + job_tools + actor_tools + monitoring_tools + log_tools
297
+
298
+
299
# Maps each tool name to a callable that takes the argument dict and returns
# the awaitable produced by the matching ray_manager method. Lambdas keep the
# ray_manager attribute lookup lazy, so it only happens for the tool actually
# invoked (and inside call_tool's try block).
_TOOL_HANDLERS = {
    # Basic cluster management
    "start_ray": lambda args: ray_manager.start_cluster(**args),
    "connect_ray": lambda args: ray_manager.connect_cluster(**args),
    "stop_ray": lambda args: ray_manager.stop_cluster(),
    "cluster_status": lambda args: ray_manager.get_cluster_status(),
    "cluster_resources": lambda args: ray_manager.get_cluster_resources(),
    "cluster_nodes": lambda args: ray_manager.get_cluster_nodes(),
    "worker_status": lambda args: ray_manager.get_worker_status(),
    # Job management
    "submit_job": lambda args: ray_manager.submit_job(**args),
    "list_jobs": lambda args: ray_manager.list_jobs(),
    "job_status": lambda args: ray_manager.get_job_status(args["job_id"]),
    "cancel_job": lambda args: ray_manager.cancel_job(args["job_id"]),
    "monitor_job": lambda args: ray_manager.monitor_job_progress(args["job_id"]),
    "debug_job": lambda args: ray_manager.debug_job(args["job_id"]),
    # Actor management
    "list_actors": lambda args: ray_manager.list_actors(args.get("filters")),
    "kill_actor": lambda args: ray_manager.kill_actor(
        args["actor_id"], args.get("no_restart", False)
    ),
    # Enhanced monitoring
    "performance_metrics": lambda args: ray_manager.get_performance_metrics(),
    "health_check": lambda args: ray_manager.cluster_health_check(),
    "optimize_config": lambda args: ray_manager.optimize_cluster_config(),
    "schedule_job": lambda args: ray_manager.schedule_job(**args),
    # Logs & debugging
    "get_logs": lambda args: ray_manager.get_logs(**args),
}

# Arguments that the handlers above read by key. Checking them up front lets
# a missing one produce a readable error instead of a bare KeyError text.
_KEYED_ARGUMENTS = {
    "job_status": ("job_id",),
    "cancel_job": ("job_id",),
    "monitor_job": ("job_id",),
    "debug_job": ("job_id",),
    "kill_actor": ("actor_id",),
}


def _json_text(payload: Any) -> List[TextContent]:
    """Wrap *payload* as a single JSON-formatted TextContent item."""
    return [TextContent(type="text", text=json.dumps(payload, indent=2))]


@server.call_tool()
async def call_tool(
    name: str, arguments: Optional[Dict[str, Any]] = None
) -> List[TextContent]:
    """Dispatch an MCP tool call to the matching ``ray_manager`` method.

    Args:
        name: Name of the tool to run (one of the names from ``list_tools``).
        arguments: Tool arguments; ``None`` is treated as an empty dict.

    Returns:
        A one-element list holding a ``TextContent``. When Ray is not
        importable the text is a plain message; otherwise it is the JSON
        encoding of the manager's result, or of a
        ``{"status": "error", "message": ...}`` object when the tool is
        unknown, a required argument is missing, or the call raised.
    """
    if not RAY_AVAILABLE:
        return [
            TextContent(
                type="text",
                text="Ray is not available. Please install Ray to use this MCP server.",
            )
        ]

    args = arguments or {}

    # Report missing keyed arguments explicitly; otherwise the KeyError below
    # would surface as an unhelpful message such as "'job_id'".
    missing = [key for key in _KEYED_ARGUMENTS.get(name, ()) if key not in args]
    if missing:
        return _json_text(
            {
                "status": "error",
                "message": f"Missing required argument(s) for {name}: {', '.join(missing)}",
            }
        )

    try:
        handler = _TOOL_HANDLERS.get(name)
        if handler is None:
            result = {"status": "error", "message": f"Unknown tool: {name}"}
        else:
            result = await handler(args)
        return _json_text(result)
    except Exception as e:
        # Top-level boundary for tool execution: log with traceback and
        # report the failure to the client instead of crashing the server.
        logger.exception("Error executing %s", name)
        return _json_text(
            {"status": "error", "message": f"Error executing {name}: {str(e)}"}
        )
384
+
385
+
386
async def _auto_connect(ray_address: str) -> None:
    """Try to connect to the cluster at *ray_address*; report the outcome on stderr.

    Failures are reported but never raised, so the server can still start and
    the client can connect later via the connect_ray tool.
    """
    print(f"Ray MCP Server starting with auto-connect to {ray_address}", file=sys.stderr)
    try:
        result = await ray_manager.connect_cluster(address=ray_address)
    except Exception as e:
        print(f"⚠️ Auto-connect failed: {e}. Use connect_ray tool manually.", file=sys.stderr)
        return

    # NOTE(review): presumably connect_cluster can report failure by returning
    # a {"status": "error", ...} dict (the shape call_tool uses for errors)
    # rather than raising — verify against RayManager. Without this check such
    # a failure would be announced as a successful connection.
    if isinstance(result, dict) and result.get("status") == "error":
        reason = result.get("message", "unknown error")
        print(f"⚠️ Auto-connect failed: {reason}. Use connect_ray tool manually.", file=sys.stderr)
        return

    print(f"✅ Auto-connected to Ray cluster at {ray_address}", file=sys.stderr)


async def main():
    """Main entry point for the MCP server.

    Exits with status 1 if Ray is not importable. Otherwise optionally
    auto-connects to the cluster named by the ``RAY_ADDRESS`` environment
    variable, then serves MCP over stdio until the streams close. Any
    unexpected error is logged with its traceback and the process exits with
    status 1. Ray is shut down on the way out if it was initialized.
    """
    if not RAY_AVAILABLE:
        logger.error("Ray is not available. Please install Ray.")
        sys.exit(1)

    try:
        # Check if RAY_ADDRESS environment variable is set for auto-connection
        ray_address = os.environ.get("RAY_ADDRESS")
        if ray_address:
            await _auto_connect(ray_address)
        else:
            print("Ray MCP Server starting (Ray not initialized yet)", file=sys.stderr)

        async with stdio_server() as (read_stream, write_stream):
            await server.run(
                read_stream, write_stream, server.create_initialization_options()
            )
    except KeyboardInterrupt:
        logger.info("Server interrupted by user")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Server error: %s", e)
        sys.exit(1)
    finally:
        # Clean up Ray if it was initialized
        if RAY_AVAILABLE and ray is not None and ray.is_initialized():
            print("Shutting down Ray cluster", file=sys.stderr)
            ray.shutdown()
420
+
421
+
422
def run_server():
    """Synchronous entry point for console script.

    Runs ``main()`` on a fresh event loop. A Ctrl-C is treated as a normal
    shutdown rather than an error.
    """
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # asyncio.run() re-raises KeyboardInterrupt to its caller after
        # cancelling the main task, so the handler inside main() is not
        # enough to avoid a traceback on Ctrl-C.
        logger.info("Server interrupted by user")
425
+
426
+
427
# Start the server when this file is executed as a script.
if __name__ == "__main__":
    run_server()