ray-mcp-server 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ray_mcp/__init__.py +5 -0
- ray_mcp/main.py +428 -0
- ray_mcp/ray_manager.py +1079 -0
- ray_mcp/tools.py +191 -0
- ray_mcp/types.py +281 -0
- ray_mcp/worker_manager.py +241 -0
- ray_mcp_server-0.2.0.dist-info/METADATA +312 -0
- ray_mcp_server-0.2.0.dist-info/RECORD +12 -0
- ray_mcp_server-0.2.0.dist-info/WHEEL +4 -0
- ray_mcp_server-0.2.0.dist-info/entry_points.txt +2 -0
- ray_mcp_server-0.2.0.dist-info/licenses/LICENSE +200 -0
- ray_mcp_server-0.2.0.dist-info/licenses/NOTICE +40 -0
ray_mcp/__init__.py
ADDED
ray_mcp/main.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Main entry point for the Ray MCP server."""
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from typing import Any, Dict, List, Optional, Union
|
|
10
|
+
|
|
11
|
+
# Import MCP types
|
|
12
|
+
from mcp.server import Server
|
|
13
|
+
from mcp.server.stdio import stdio_server
|
|
14
|
+
from mcp.types import Content, EmbeddedResource, ImageContent, TextContent, Tool
|
|
15
|
+
|
|
16
|
+
# Import Ray modules with proper error handling
|
|
17
|
+
try:
|
|
18
|
+
import ray
|
|
19
|
+
from ray import job_submission
|
|
20
|
+
|
|
21
|
+
RAY_AVAILABLE = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
RAY_AVAILABLE = False
|
|
24
|
+
ray = None
|
|
25
|
+
job_submission = None
|
|
26
|
+
|
|
27
|
+
from .ray_manager import RayManager
|
|
28
|
+
from .types import (
|
|
29
|
+
ActorConfig,
|
|
30
|
+
ActorId,
|
|
31
|
+
ActorInfo,
|
|
32
|
+
ActorState,
|
|
33
|
+
ClusterHealth,
|
|
34
|
+
ErrorResponse,
|
|
35
|
+
HealthStatus,
|
|
36
|
+
JobId,
|
|
37
|
+
JobInfo,
|
|
38
|
+
JobStatus,
|
|
39
|
+
JobSubmissionConfig,
|
|
40
|
+
NodeId,
|
|
41
|
+
NodeInfo,
|
|
42
|
+
PerformanceMetrics,
|
|
43
|
+
Response,
|
|
44
|
+
SuccessResponse,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# Initialize server and ray manager
# `server` is the MCP Server instance (registered under the name "ray-mcp")
# whose decorators are used below to register the tool handlers.
server = Server("ray-mcp")
# Single module-level RayManager; every tool call in this module is delegated to it.
ray_manager = RayManager()

# Configure logging
# Root logger at INFO. With no handler arguments, logging.basicConfig attaches
# a StreamHandler that writes to sys.stderr.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _no_arg_schema() -> Dict[str, Any]:
    """Build a fresh input schema for a tool that accepts no arguments."""
    return {"type": "object", "properties": {}}


def _job_id_schema() -> Dict[str, Any]:
    """Build a fresh input schema whose single required property is a string ``job_id``."""
    return {
        "type": "object",
        "properties": {"job_id": {"type": "string"}},
        "required": ["job_id"],
    }


@server.list_tools()
async def list_tools() -> List[Tool]:
    """Return the Tool definitions registered with the MCP server.

    The list is assembled section by section (cluster, jobs, actors,
    monitoring, logs) and concatenated in that order. Each entry carries the
    tool name, a human-readable description and a JSON-schema-style
    ``inputSchema`` dict describing its arguments.
    """
    # Schema for one element of start_ray's "worker_nodes" array.
    worker_node_item = {
        "type": "object",
        "properties": {
            "num_cpus": {
                "type": "integer",
                "minimum": 1,
                "description": "Number of CPUs for this worker node",
            },
            "num_gpus": {
                "type": "integer",
                "minimum": 0,
                "description": "Number of GPUs for this worker node",
            },
            "object_store_memory": {
                "type": "integer",
                "minimum": 0,
                "description": "Object store memory in bytes for this worker node",
            },
            "resources": {
                "type": "object",
                "description": "Additional custom resources for this worker node",
            },
            "node_name": {
                "type": "string",
                "description": "Optional name for this worker node",
            },
        },
        "required": ["num_cpus"],
    }

    start_ray_schema = {
        "type": "object",
        "properties": {
            "num_cpus": {
                "type": "integer",
                "minimum": 1,
                "default": 1,
                "description": "Number of CPUs for head node",
            },
            "num_gpus": {
                "type": "integer",
                "minimum": 0,
                "description": "Number of GPUs for head node",
            },
            "object_store_memory": {
                "type": "integer",
                "minimum": 0,
                "description": "Object store memory in bytes for head node",
            },
            "worker_nodes": {
                "type": "array",
                "description": "Configuration for worker nodes to start",
                "items": worker_node_item,
            },
            "head_node_port": {
                "type": "integer",
                "minimum": 10000,
                "maximum": 65535,
                "default": 10001,
                "description": "Port for head node",
            },
            "dashboard_port": {
                "type": "integer",
                "minimum": 1000,
                "maximum": 65535,
                "default": 8265,
                "description": "Port for Ray dashboard",
            },
            "head_node_host": {
                "type": "string",
                "default": "127.0.0.1",
                "description": "Host address for head node",
            },
        },
    }

    connect_ray_schema = {
        "type": "object",
        "properties": {
            "address": {
                "type": "string",
                "description": "Ray cluster address (e.g., 'ray://127.0.0.1:10001' or '127.0.0.1:10001')",
            }
        },
        "required": ["address"],
    }

    # Basic cluster management
    cluster_tools = [
        Tool(
            name="start_ray",
            description="Start a new Ray cluster with head node and worker nodes (defaults to multi-node with 2 workers)",
            inputSchema=start_ray_schema,
        ),
        Tool(
            name="connect_ray",
            description="Connect to an existing Ray cluster",
            inputSchema=connect_ray_schema,
        ),
    ]
    # The remaining cluster tools take no arguments and differ only in
    # name and description.
    for tool_name, tool_description in (
        ("stop_ray", "Stop the Ray cluster"),
        ("cluster_status", "Get Ray cluster status"),
        ("cluster_resources", "Get cluster resource information"),
        ("cluster_nodes", "Get cluster node information"),
        ("worker_status", "Get detailed status of worker nodes"),
    ):
        cluster_tools.append(
            Tool(
                name=tool_name,
                description=tool_description,
                inputSchema=_no_arg_schema(),
            )
        )

    # Job management
    job_tools = [
        Tool(
            name="submit_job",
            description="Submit a job to the Ray cluster",
            inputSchema={
                "type": "object",
                "properties": {
                    "entrypoint": {"type": "string"},
                    "runtime_env": {"type": "object"},
                    "job_id": {"type": "string"},
                    "metadata": {"type": "object"},
                },
                "required": ["entrypoint"],
            },
        ),
        Tool(
            name="list_jobs",
            description="List all jobs in the cluster",
            inputSchema=_no_arg_schema(),
        ),
    ]
    # Tools that operate on one existing job all share the job_id-only schema.
    for tool_name, tool_description in (
        ("job_status", "Get the status of a specific job"),
        ("cancel_job", "Cancel a running job"),
        ("monitor_job", "Monitor job progress"),
        ("debug_job", "Debug a job with detailed information"),
    ):
        job_tools.append(
            Tool(
                name=tool_name,
                description=tool_description,
                inputSchema=_job_id_schema(),
            )
        )

    # Actor management
    actor_tools = [
        Tool(
            name="list_actors",
            description="List all actors in the cluster",
            inputSchema={
                "type": "object",
                "properties": {"filters": {"type": "object"}},
            },
        ),
        Tool(
            name="kill_actor",
            description="Kill an actor",
            inputSchema={
                "type": "object",
                "properties": {
                    "actor_id": {"type": "string"},
                    "no_restart": {"type": "boolean", "default": False},
                },
                "required": ["actor_id"],
            },
        ),
    ]

    # Enhanced monitoring
    monitoring_tools = [
        Tool(
            name=tool_name,
            description=tool_description,
            inputSchema=_no_arg_schema(),
        )
        for tool_name, tool_description in (
            ("performance_metrics", "Get detailed cluster performance metrics"),
            ("health_check", "Perform comprehensive cluster health check"),
            ("optimize_config", "Get cluster optimization recommendations"),
        )
    ]
    monitoring_tools.append(
        Tool(
            name="schedule_job",
            description="Schedule a job to run periodically",
            inputSchema={
                "type": "object",
                "properties": {
                    "entrypoint": {"type": "string"},
                    "schedule": {"type": "string"},
                },
                "required": ["entrypoint", "schedule"],
            },
        )
    )

    # Logs & debugging
    log_tools = [
        Tool(
            name="get_logs",
            description="Get logs from jobs, actors, or nodes",
            inputSchema={
                "type": "object",
                "properties": {
                    "job_id": {"type": "string"},
                    "actor_id": {"type": "string"},
                    "node_id": {"type": "string"},
                    "num_lines": {"type": "integer", "minimum": 1, "default": 100},
                },
            },
        ),
    ]

    return [
        *cluster_tools,
        *job_tools,
        *actor_tools,
        *monitoring_tools,
        *log_tools,
    ]
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# Tool name -> RayManager method name, grouped by how the tool's arguments are
# forwarded. Method names are resolved lazily with getattr() at call time so a
# method missing from RayManager only affects the tool that needs it.

# Tools whose RayManager method is called with no arguments.
_NO_ARG_TOOLS = {
    "stop_ray": "stop_cluster",
    "cluster_status": "get_cluster_status",
    "cluster_resources": "get_cluster_resources",
    "cluster_nodes": "get_cluster_nodes",
    "worker_status": "get_worker_status",
    "list_jobs": "list_jobs",
    "performance_metrics": "get_performance_metrics",
    "health_check": "cluster_health_check",
    "optimize_config": "optimize_cluster_config",
}

# Tools whose arguments dict is forwarded unchanged as keyword arguments.
_KWARGS_TOOLS = {
    "start_ray": "start_cluster",
    "connect_ray": "connect_cluster",
    "submit_job": "submit_job",
    "schedule_job": "schedule_job",
    "get_logs": "get_logs",
}

# Tools whose RayManager method takes the job id as its only positional argument.
_JOB_ID_TOOLS = {
    "job_status": "get_job_status",
    "cancel_job": "cancel_job",
    "monitor_job": "monitor_job_progress",
    "debug_job": "debug_job",
}


def _required_arg(args: Dict[str, Any], key: str) -> Any:
    """Return ``args[key]``, raising ValueError with a readable message if it is absent."""
    try:
        return args[key]
    except KeyError:
        # A bare KeyError stringifies to just "'job_id'", which is not a
        # useful error message to hand back to the tool caller.
        raise ValueError(f"Missing required argument: {key}") from None


@server.call_tool()
async def call_tool(
    name: str, arguments: Optional[Dict[str, Any]] = None
) -> List[TextContent]:
    """Call a Ray tool.

    Args:
        name: Tool name as advertised by ``list_tools``.
        arguments: Tool arguments; ``None`` is treated as an empty dict.

    Returns:
        A single-element list holding a ``TextContent``. When Ray could not be
        imported the text is a plain notice. Otherwise the text is the
        JSON-encoded (``indent=2``) value returned by the matching
        ``ray_manager`` method, or a JSON object with ``"status": "error"``
        for an unknown tool name or when the call raised an exception.
    """
    if not RAY_AVAILABLE:
        return [
            TextContent(
                type="text",
                text="Ray is not available. Please install Ray to use this MCP server.",
            )
        ]

    args = arguments or {}

    try:
        if name in _NO_ARG_TOOLS:
            result = await getattr(ray_manager, _NO_ARG_TOOLS[name])()
        elif name in _KWARGS_TOOLS:
            result = await getattr(ray_manager, _KWARGS_TOOLS[name])(**args)
        elif name in _JOB_ID_TOOLS:
            result = await getattr(ray_manager, _JOB_ID_TOOLS[name])(
                _required_arg(args, "job_id")
            )
        elif name == "list_actors":
            result = await ray_manager.list_actors(args.get("filters"))
        elif name == "kill_actor":
            result = await ray_manager.kill_actor(
                _required_arg(args, "actor_id"), args.get("no_restart", False)
            )
        else:
            result = {"status": "error", "message": f"Unknown tool: {name}"}

        # Serialization stays inside the try so a non-JSON-serializable result
        # is reported through the same error path as a failed call.
        return [TextContent(type="text", text=json.dumps(result, indent=2))]

    except Exception as e:
        # Boundary handler: a tool failure is reported to the caller as an
        # error payload rather than propagated. logger.exception keeps the
        # traceback in the log.
        logger.exception("Error executing %s: %s", name, e)
        return [
            TextContent(
                type="text",
                text=json.dumps(
                    {"status": "error", "message": f"Error executing {name}: {str(e)}"},
                    indent=2,
                ),
            )
        ]
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
async def main():
    """Main entry point for the MCP server.

    Exits with status 1 if Ray could not be imported. Otherwise, when the
    ``RAY_ADDRESS`` environment variable is set, attempts to connect to that
    cluster through ``ray_manager`` (a failure is reported on stderr and does
    not stop the server), then serves the MCP protocol over stdio until the
    streams close. On the way out, ``ray.shutdown()`` is called if Ray is
    initialized. Status messages are printed to stderr.
    """
    if not RAY_AVAILABLE:
        logger.error("Ray is not available. Please install Ray.")
        sys.exit(1)

    try:
        # Check if RAY_ADDRESS environment variable is set for auto-connection
        ray_address = os.environ.get("RAY_ADDRESS")
        if ray_address:
            print(f"Ray MCP Server starting with auto-connect to {ray_address}", file=sys.stderr)
            try:
                # Auto-connect to Ray cluster
                outcome = await ray_manager.connect_cluster(address=ray_address)
            except Exception as e:
                print(f"⚠️  Auto-connect failed: {e}. Use connect_ray tool manually.", file=sys.stderr)
            else:
                # NOTE(review): call_tool in this file reports failures as
                # {"status": "error", "message": ...} dicts; presumably
                # connect_cluster can return the same shape instead of
                # raising -- confirm against RayManager. Without this check a
                # failed connect would still be announced as a success.
                if isinstance(outcome, dict) and outcome.get("status") == "error":
                    reason = outcome.get("message", "unknown error")
                    print(f"⚠️  Auto-connect failed: {reason}. Use connect_ray tool manually.", file=sys.stderr)
                else:
                    print(f"✅ Auto-connected to Ray cluster at {ray_address}", file=sys.stderr)
        else:
            print("Ray MCP Server starting (Ray not initialized yet)", file=sys.stderr)

        async with stdio_server() as (read_stream, write_stream):
            await server.run(
                read_stream, write_stream, server.create_initialization_options()
            )
    except KeyboardInterrupt:
        logger.info("Server interrupted by user")
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Server error: %s", e)
        sys.exit(1)
    finally:
        # Clean up Ray if it was initialized
        if RAY_AVAILABLE and ray is not None and ray.is_initialized():
            print("Shutting down Ray cluster", file=sys.stderr)
            ray.shutdown()
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def run_server():
    """Synchronous entry point for console script.

    Runs ``main()`` to completion on a new asyncio event loop. A Ctrl-C is
    logged and swallowed so the process exits without a traceback.
    """
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # asyncio.run() cancels the main task on SIGINT and then re-raises
        # KeyboardInterrupt here, so the coroutine's own handler is not
        # enough to get a clean shutdown.
        logger.info("Server interrupted by user")


if __name__ == "__main__":
    run_server()
|