xoscar 0.7.16__cp310-cp310-macosx_11_0_arm64.whl → 0.8.0__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xoscar might be problematic. Click here for more details.
- xoscar/_utils.cpython-310-darwin.so +0 -0
- xoscar/backends/communication/socket.py +4 -4
- xoscar/backends/core.py +57 -9
- xoscar/backends/indigen/pool.py +59 -15
- xoscar/backends/message.cpython-310-darwin.so +0 -0
- xoscar/backends/pool.py +19 -14
- xoscar/backends/test/pool.py +37 -1
- xoscar/collective/backend/__init__.py +13 -0
- xoscar/collective/backend/nccl_backend.py +160 -0
- xoscar/collective/xoscar_pygloo.cpython-310-darwin.so +0 -0
- xoscar/context.cpython-310-darwin.so +0 -0
- xoscar/core.cpython-310-darwin.so +0 -0
- xoscar/serialization/core.cpython-310-darwin.so +0 -0
- xoscar/utils.py +5 -0
- xoscar/virtualenv/platform.py +4 -1
- {xoscar-0.7.16.dist-info → xoscar-0.8.0.dist-info}/METADATA +2 -1
- {xoscar-0.7.16.dist-info → xoscar-0.8.0.dist-info}/RECORD +19 -17
- {xoscar-0.7.16.dist-info → xoscar-0.8.0.dist-info}/WHEEL +0 -0
- {xoscar-0.7.16.dist-info → xoscar-0.8.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
|
@@ -31,7 +31,7 @@ from urllib.parse import urlparse
|
|
|
31
31
|
from ..._utils import to_binary
|
|
32
32
|
from ...constants import XOSCAR_CONNECT_TIMEOUT, XOSCAR_UNIX_SOCKET_DIR
|
|
33
33
|
from ...serialization import AioDeserializer, AioSerializer, deserialize
|
|
34
|
-
from ...utils import classproperty, implements, is_py_312, is_v6_ip
|
|
34
|
+
from ...utils import classproperty, implements, is_py_312, is_py_312_or_above, is_v6_ip
|
|
35
35
|
from .base import Channel, ChannelType, Client, Server
|
|
36
36
|
from .core import register_client, register_server
|
|
37
37
|
from .errors import ChannelClosed
|
|
@@ -192,9 +192,9 @@ class _BaseSocketServer(Server, metaclass=ABCMeta):
|
|
|
192
192
|
@implements(Server.stop)
|
|
193
193
|
async def stop(self):
|
|
194
194
|
self._aio_server.close()
|
|
195
|
-
# Python 3.12
|
|
196
|
-
# `wait_closed` leads to hang
|
|
197
|
-
if not
|
|
195
|
+
# Python 3.12+: see https://github.com/python/cpython/issues/104344
|
|
196
|
+
# `wait_closed` leads to hang in Python 3.12 and 3.13
|
|
197
|
+
if not is_py_312_or_above():
|
|
198
198
|
await self._aio_server.wait_closed()
|
|
199
199
|
# close all channels
|
|
200
200
|
await asyncio.gather(
|
xoscar/backends/core.py
CHANGED
|
@@ -244,7 +244,27 @@ def _cancel_all_tasks(loop):
|
|
|
244
244
|
for task in to_cancel:
|
|
245
245
|
task.cancel()
|
|
246
246
|
|
|
247
|
-
|
|
247
|
+
# In Python 3.13+, we need to use a different approach to avoid deadlocks
|
|
248
|
+
# when shutting down event loops in threads
|
|
249
|
+
if hasattr(asyncio, "run"):
|
|
250
|
+
# asyncio.run exists since Python 3.7, so this branch runs on every supported version, not only 3.13+
|
|
251
|
+
async def _gather_cancelled():
|
|
252
|
+
await asyncio.gather(*to_cancel, return_exceptions=True)
|
|
253
|
+
|
|
254
|
+
try:
|
|
255
|
+
# Try to run the gather in the current loop context
|
|
256
|
+
if loop.is_running():
|
|
257
|
+
# If loop is running, schedule the gather
|
|
258
|
+
asyncio.run_coroutine_threadsafe(_gather_cancelled(), loop)
|
|
259
|
+
else:
|
|
260
|
+
# If loop is not running, we can run it directly
|
|
261
|
+
loop.run_until_complete(_gather_cancelled())
|
|
262
|
+
except RuntimeError:
|
|
263
|
+
# If we can't run the gather, just log and continue
|
|
264
|
+
logger.debug("Could not gather cancelled tasks during shutdown")
|
|
265
|
+
else:
|
|
266
|
+
# For older Python versions, use the original approach
|
|
267
|
+
loop.run_until_complete(asyncio.gather(*to_cancel, return_exceptions=True))
|
|
248
268
|
|
|
249
269
|
for task in to_cancel:
|
|
250
270
|
if task.cancelled():
|
|
@@ -263,8 +283,15 @@ def _safe_run_forever(loop):
|
|
|
263
283
|
try:
|
|
264
284
|
loop.run_forever()
|
|
265
285
|
finally:
|
|
266
|
-
|
|
267
|
-
|
|
286
|
+
try:
|
|
287
|
+
_cancel_all_tasks(loop)
|
|
288
|
+
except Exception as e:
|
|
289
|
+
logger.debug("Error during task cancellation: %s", e)
|
|
290
|
+
finally:
|
|
291
|
+
try:
|
|
292
|
+
loop.stop()
|
|
293
|
+
except Exception as e:
|
|
294
|
+
logger.debug("Error stopping loop: %s", e)
|
|
268
295
|
|
|
269
296
|
|
|
270
297
|
class ActorCaller:
|
|
@@ -273,12 +300,31 @@ class ActorCaller:
|
|
|
273
300
|
class _RefHolder:
|
|
274
301
|
pass
|
|
275
302
|
|
|
276
|
-
_close_loop =
|
|
277
|
-
_close_thread =
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
303
|
+
_close_loop = None
|
|
304
|
+
_close_thread = None
|
|
305
|
+
_initialized = False
|
|
306
|
+
|
|
307
|
+
@classmethod
|
|
308
|
+
def _ensure_initialized(cls):
|
|
309
|
+
if not cls._initialized:
|
|
310
|
+
cls._close_loop = asyncio.new_event_loop()
|
|
311
|
+
cls._close_thread = threading.Thread(
|
|
312
|
+
target=_safe_run_forever, args=(cls._close_loop,), daemon=True
|
|
313
|
+
)
|
|
314
|
+
cls._close_thread.start()
|
|
315
|
+
atexit.register(cls._cleanup)
|
|
316
|
+
cls._initialized = True
|
|
317
|
+
|
|
318
|
+
@classmethod
|
|
319
|
+
def _cleanup(cls):
|
|
320
|
+
if cls._close_loop and cls._close_loop.is_running():
|
|
321
|
+
try:
|
|
322
|
+
cls._close_loop.call_soon_threadsafe(cls._close_loop.stop)
|
|
323
|
+
# Give the loop a moment to stop
|
|
324
|
+
if cls._close_thread:
|
|
325
|
+
cls._close_thread.join(timeout=0.5) # Shorter timeout for tests
|
|
326
|
+
except Exception as e:
|
|
327
|
+
logger.debug("Error during cleanup: %s", e)
|
|
282
328
|
|
|
283
329
|
def __init__(self):
|
|
284
330
|
self._thread_local = threading.local()
|
|
@@ -294,6 +340,8 @@ class ActorCaller:
|
|
|
294
340
|
# If the thread exits, we clean up the related actor callers and channels.
|
|
295
341
|
|
|
296
342
|
def _cleanup():
|
|
343
|
+
self._ensure_initialized()
|
|
344
|
+
# Use the background thread for cleanup
|
|
297
345
|
asyncio.run_coroutine_threadsafe(actor_caller.stop(), self._close_loop)
|
|
298
346
|
logger.debug(
|
|
299
347
|
"Clean up the actor caller due to thread exit: %s", thread_info
|
xoscar/backends/indigen/pool.py
CHANGED
|
@@ -418,26 +418,70 @@ class MainActorPool(MainActorPoolBase):
|
|
|
418
418
|
async def kill_sub_pool(
|
|
419
419
|
self, process: asyncio.subprocess.Process, force: bool = False
|
|
420
420
|
):
|
|
421
|
+
# First, try to terminate the process gracefully
|
|
422
|
+
if not force:
|
|
423
|
+
try:
|
|
424
|
+
process.terminate()
|
|
425
|
+
# Wait for graceful termination
|
|
426
|
+
try:
|
|
427
|
+
await asyncio.wait_for(process.wait(), timeout=2.0)
|
|
428
|
+
except asyncio.TimeoutError:
|
|
429
|
+
# Process didn't terminate gracefully, force kill
|
|
430
|
+
force = True
|
|
431
|
+
except ProcessLookupError:
|
|
432
|
+
# Process already terminated
|
|
433
|
+
pass
|
|
434
|
+
|
|
435
|
+
# Force kill if needed or if graceful termination failed
|
|
436
|
+
if force:
|
|
437
|
+
try:
|
|
438
|
+
process.kill()
|
|
439
|
+
except ProcessLookupError:
|
|
440
|
+
# Process already dead
|
|
441
|
+
pass
|
|
442
|
+
|
|
443
|
+
# Ensure process is completely terminated and cleaned up
|
|
421
444
|
try:
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
445
|
+
# Wait for process to complete
|
|
446
|
+
if process.returncode is None:
|
|
447
|
+
try:
|
|
448
|
+
await asyncio.wait_for(process.wait(), timeout=5.0)
|
|
449
|
+
except asyncio.TimeoutError:
|
|
450
|
+
pass
|
|
451
|
+
except ProcessLookupError:
|
|
452
|
+
# Process already terminated
|
|
453
|
+
pass
|
|
425
454
|
|
|
426
|
-
|
|
427
|
-
|
|
455
|
+
# Python 3.13 specific cleanup for waitpid threads
|
|
456
|
+
if sys.version_info >= (3, 13):
|
|
428
457
|
try:
|
|
429
|
-
|
|
430
|
-
|
|
458
|
+
# Close the transport to clean up waitpid thread
|
|
459
|
+
if hasattr(process, "_transport") and process._transport:
|
|
460
|
+
process._transport.close()
|
|
461
|
+
# Also try to close the pipe transport if it exists
|
|
462
|
+
if hasattr(process, "_pipes") and process._pipes:
|
|
463
|
+
for pipe in process._pipes.values():
|
|
464
|
+
if hasattr(pipe, "close"):
|
|
465
|
+
pipe.close()
|
|
466
|
+
except Exception:
|
|
467
|
+
# Ignore errors during cleanup
|
|
431
468
|
pass
|
|
432
469
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
p.
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
470
|
+
# Additional cleanup using psutil to ensure process tree is terminated
|
|
471
|
+
try:
|
|
472
|
+
p = psutil.Process(process.pid)
|
|
473
|
+
if p.is_running():
|
|
474
|
+
# Kill the entire process tree
|
|
475
|
+
for child in p.children(recursive=True):
|
|
476
|
+
try:
|
|
477
|
+
child.kill()
|
|
478
|
+
except psutil.NoSuchProcess:
|
|
479
|
+
pass
|
|
480
|
+
p.kill()
|
|
481
|
+
p.wait(timeout=2.0)
|
|
482
|
+
except (psutil.NoSuchProcess, psutil.TimeoutExpired):
|
|
483
|
+
# Process already dead or couldn't be killed
|
|
484
|
+
pass
|
|
441
485
|
|
|
442
486
|
async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
|
|
443
487
|
return process.returncode is None
|
|
Binary file
|
xoscar/backends/pool.py
CHANGED
|
@@ -1337,7 +1337,9 @@ class MainActorPoolBase(ActorPoolBase):
|
|
|
1337
1337
|
return pool
|
|
1338
1338
|
|
|
1339
1339
|
async def start_monitor(self):
|
|
1340
|
-
if
|
|
1340
|
+
# Only start monitor if there are sub processes to monitor
|
|
1341
|
+
# This prevents hanging when n_process=0
|
|
1342
|
+
if self._monitor_task is None and self.sub_processes:
|
|
1341
1343
|
self._monitor_task = asyncio.create_task(self.monitor_sub_pools())
|
|
1342
1344
|
return self._monitor_task
|
|
1343
1345
|
|
|
@@ -1351,7 +1353,12 @@ class MainActorPoolBase(ActorPoolBase):
|
|
|
1351
1353
|
self._auto_recover = False
|
|
1352
1354
|
self._stopped.set()
|
|
1353
1355
|
if self._monitor_task and not self._monitor_task.done():
|
|
1354
|
-
|
|
1356
|
+
# Cancel the monitor task to ensure it exits immediately
|
|
1357
|
+
self._monitor_task.cancel()
|
|
1358
|
+
try:
|
|
1359
|
+
await self._monitor_task
|
|
1360
|
+
except asyncio.CancelledError:
|
|
1361
|
+
pass # Expected when cancelling the task
|
|
1355
1362
|
self._monitor_task = None
|
|
1356
1363
|
await self.stop_sub_pools()
|
|
1357
1364
|
await super().stop()
|
|
@@ -1406,19 +1413,17 @@ class MainActorPoolBase(ActorPoolBase):
|
|
|
1406
1413
|
)
|
|
1407
1414
|
try:
|
|
1408
1415
|
if timeout is None:
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
force = True
|
|
1419
|
-
except (ConnectionError, ServerClosed): # pragma: no cover
|
|
1416
|
+
# Use a short timeout for graceful shutdown to avoid hanging
|
|
1417
|
+
timeout = 2.0
|
|
1418
|
+
|
|
1419
|
+
call = asyncio.create_task(self.call(address, stop_message))
|
|
1420
|
+
try:
|
|
1421
|
+
await asyncio.wait_for(call, timeout)
|
|
1422
|
+
except (futures.TimeoutError, asyncio.TimeoutError):
|
|
1423
|
+
force = True
|
|
1424
|
+
except (ConnectionError, ServerClosed):
|
|
1420
1425
|
# process dead maybe, ignore it
|
|
1421
|
-
|
|
1426
|
+
force = True
|
|
1422
1427
|
# kill process
|
|
1423
1428
|
await self.kill_sub_pool(process, force=force)
|
|
1424
1429
|
|
xoscar/backends/test/pool.py
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
from __future__ import annotations
|
|
17
17
|
|
|
18
18
|
import asyncio
|
|
19
|
+
import sys
|
|
19
20
|
from typing import Any, Optional
|
|
20
21
|
|
|
21
22
|
from ..communication import DummyServer, gen_local_address
|
|
@@ -153,9 +154,44 @@ class TestMainActorPool(MainActorPool):
|
|
|
153
154
|
async def kill_sub_pool(
|
|
154
155
|
self, process: asyncio.subprocess.Process, force: bool = False
|
|
155
156
|
):
|
|
156
|
-
|
|
157
|
+
# Test pool uses None for processes, so skip if process is None
|
|
158
|
+
if process is None:
|
|
159
|
+
return
|
|
160
|
+
|
|
161
|
+
if force:
|
|
162
|
+
try:
|
|
163
|
+
process.kill()
|
|
164
|
+
except ProcessLookupError:
|
|
165
|
+
pass
|
|
166
|
+
|
|
167
|
+
# Ensure process is completely terminated and cleaned up
|
|
168
|
+
try:
|
|
169
|
+
# Wait for process to complete
|
|
170
|
+
if process.returncode is None:
|
|
171
|
+
try:
|
|
172
|
+
await asyncio.wait_for(process.wait(), timeout=5.0)
|
|
173
|
+
except asyncio.TimeoutError:
|
|
174
|
+
pass
|
|
175
|
+
except ProcessLookupError:
|
|
176
|
+
pass
|
|
177
|
+
|
|
178
|
+
# Python 3.13 specific cleanup for waitpid threads
|
|
179
|
+
if sys.version_info >= (3, 13):
|
|
180
|
+
try:
|
|
181
|
+
# Close the transport to clean up waitpid thread
|
|
182
|
+
if hasattr(process, "_transport") and process._transport:
|
|
183
|
+
process._transport.close()
|
|
184
|
+
# Also try to close the pipe transport if it exists
|
|
185
|
+
if hasattr(process, "_pipes") and process._pipes:
|
|
186
|
+
for pipe in process._pipes.values():
|
|
187
|
+
if hasattr(pipe, "close"):
|
|
188
|
+
pipe.close()
|
|
189
|
+
except Exception:
|
|
190
|
+
# Ignore errors during cleanup
|
|
191
|
+
pass
|
|
157
192
|
|
|
158
193
|
async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
|
|
194
|
+
# Test pool uses None for processes, so always return True
|
|
159
195
|
return True
|
|
160
196
|
|
|
161
197
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-2025 XProbe Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Copyright 2022-2023 XProbe Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# We need to extend cupy's inner class because actors are daemonic processes
|
|
16
|
+
# which are not allowed to have children. However, the original code in cupy
|
|
17
|
+
# will create children processes.
|
|
18
|
+
|
|
19
|
+
import queue
|
|
20
|
+
import socket
|
|
21
|
+
import threading
|
|
22
|
+
from ctypes import sizeof
|
|
23
|
+
|
|
24
|
+
from ...utils import lazy_import
|
|
25
|
+
|
|
26
|
+
cupy = lazy_import("cupy")
|
|
27
|
+
|
|
28
|
+
if cupy is not None:
|
|
29
|
+
import cupyx.distributed
|
|
30
|
+
from cupy.cuda import nccl
|
|
31
|
+
from cupyx.distributed import _klv_utils, _store, _store_actions
|
|
32
|
+
|
|
33
|
+
class ExceptionAwareThreading(threading.Thread):
|
|
34
|
+
def __init__(self, *args, **kwargs):
|
|
35
|
+
super().__init__(*args, **kwargs)
|
|
36
|
+
self._exception = None
|
|
37
|
+
self.q = queue.Queue()
|
|
38
|
+
|
|
39
|
+
def run(self):
|
|
40
|
+
try:
|
|
41
|
+
super().run()
|
|
42
|
+
self.q.put(None)
|
|
43
|
+
except Exception as e:
|
|
44
|
+
self.q.put(e)
|
|
45
|
+
|
|
46
|
+
def join(self):
|
|
47
|
+
super().join()
|
|
48
|
+
if not self.q.empty():
|
|
49
|
+
exception = self.q.get()
|
|
50
|
+
if exception is not None:
|
|
51
|
+
raise exception
|
|
52
|
+
|
|
53
|
+
class TCPStore:
|
|
54
|
+
# This is only used for initialization of nccl so we don't care
|
|
55
|
+
# too much about performance
|
|
56
|
+
def __init__(self, world_size):
|
|
57
|
+
self.storage = {}
|
|
58
|
+
self._thread = None
|
|
59
|
+
self._world_size = world_size
|
|
60
|
+
self._run = 1
|
|
61
|
+
# For implementing a barrier
|
|
62
|
+
self._lock = threading.Lock()
|
|
63
|
+
self._current_barrier = None
|
|
64
|
+
|
|
65
|
+
def __del__(self):
|
|
66
|
+
if not _store._exit_mode:
|
|
67
|
+
self.stop()
|
|
68
|
+
|
|
69
|
+
def _thread_request(self, c_socket):
|
|
70
|
+
with c_socket:
|
|
71
|
+
# Receive in KLV format
|
|
72
|
+
action_bytes = c_socket.recv(sizeof(_klv_utils.action_t))
|
|
73
|
+
if len(action_bytes) > 0:
|
|
74
|
+
action_m = _klv_utils.action_t.from_buffer_copy(action_bytes)
|
|
75
|
+
if action_m.length > 256:
|
|
76
|
+
raise ValueError("Invalid length for message")
|
|
77
|
+
value = bytearray(action_m.value)[: action_m.length]
|
|
78
|
+
r = _store_actions.execute_action(action_m.action, value, self)
|
|
79
|
+
if r is not None:
|
|
80
|
+
c_socket.sendall(r.klv())
|
|
81
|
+
|
|
82
|
+
def _server_loop(self, host, port):
|
|
83
|
+
# This is for minimum info exchange during initialization
|
|
84
|
+
# a single connection allows to implement locking mechanics easily
|
|
85
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
86
|
+
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
|
87
|
+
s.bind((host, port))
|
|
88
|
+
s.listen()
|
|
89
|
+
s.settimeout(0.5)
|
|
90
|
+
while self._run == 1:
|
|
91
|
+
try:
|
|
92
|
+
c_socket, addr = s.accept()
|
|
93
|
+
except socket.timeout:
|
|
94
|
+
continue
|
|
95
|
+
|
|
96
|
+
t = threading.Thread(
|
|
97
|
+
target=self._thread_request, args=(c_socket,), daemon=True
|
|
98
|
+
)
|
|
99
|
+
t.start()
|
|
100
|
+
|
|
101
|
+
def run(self, host=_store._DEFAULT_HOST, port=_store._DEFAULT_PORT):
|
|
102
|
+
# Run the TCP store in a background thread (not a child process)
|
|
103
|
+
t = ExceptionAwareThreading(target=self._server_loop, args=(host, port))
|
|
104
|
+
t.start()
|
|
105
|
+
self._thread = t
|
|
106
|
+
|
|
107
|
+
def stop(self):
|
|
108
|
+
if _store._exit_mode:
|
|
109
|
+
return # Prevent shutdown errors
|
|
110
|
+
if self._thread is not None:
|
|
111
|
+
# acquire the lock
|
|
112
|
+
self._lock.acquire()
|
|
113
|
+
self._run = 0
|
|
114
|
+
self._lock.release()
|
|
115
|
+
self._thread.join()
|
|
116
|
+
|
|
117
|
+
class XoscarNCCLBackend(cupyx.distributed.NCCLBackend):
|
|
118
|
+
"""Interface that uses NVIDIA's NCCL to perform communications.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
n_devices (int): Total number of devices that will be used in the
|
|
122
|
+
distributed execution.
|
|
123
|
+
rank (int): Unique id of the GPU that the communicator is associated to
|
|
124
|
+
its value needs to be `0 <= rank < n_devices`.
|
|
125
|
+
host (str, optional): host address for the process rendezvous on
|
|
126
|
+
initialization. Defaults to `"127.0.0.1"`.
|
|
127
|
+
port (int, optional): port used for the process rendezvous on
|
|
128
|
+
initialization. Defaults to `13333`.
|
|
129
|
+
use_mpi(bool, optional): switch between MPI and use the included TCP
|
|
130
|
+
server for initialization & synchronization. Defaults to `False`.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
def __init__(
|
|
134
|
+
self,
|
|
135
|
+
n_devices,
|
|
136
|
+
rank,
|
|
137
|
+
tcpstore,
|
|
138
|
+
host=_store._DEFAULT_HOST,
|
|
139
|
+
port=_store._DEFAULT_PORT,
|
|
140
|
+
use_mpi=False,
|
|
141
|
+
):
|
|
142
|
+
self._tcpstore = tcpstore
|
|
143
|
+
super().__init__(n_devices, rank, host, port, use_mpi)
|
|
144
|
+
|
|
145
|
+
def _init_with_tcp_store(self, n_devices, rank, host, port):
|
|
146
|
+
nccl_id = None
|
|
147
|
+
if rank == 0:
|
|
148
|
+
self._tcpstore.run(host, port)
|
|
149
|
+
nccl_id = nccl.get_unique_id()
|
|
150
|
+
# get_unique_id returns negative values due to cython issues
|
|
151
|
+
# with bytes && c strings. We shift them by 128 to
|
|
152
|
+
# make them positive and send them as bytes to the proxy store
|
|
153
|
+
shifted_nccl_id = bytes([b + 128 for b in nccl_id])
|
|
154
|
+
self._store_proxy["nccl_id"] = shifted_nccl_id
|
|
155
|
+
self._store_proxy.barrier()
|
|
156
|
+
else:
|
|
157
|
+
self._store_proxy.barrier()
|
|
158
|
+
nccl_id = self._store_proxy["nccl_id"]
|
|
159
|
+
nccl_id = tuple([int(b) - 128 for b in nccl_id])
|
|
160
|
+
self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
xoscar/utils.py
CHANGED
|
@@ -470,6 +470,11 @@ def is_py_312():
|
|
|
470
470
|
return sys.version_info[:2] == (3, 12)
|
|
471
471
|
|
|
472
472
|
|
|
473
|
+
@lru_cache
|
|
474
|
+
def is_py_312_or_above():
|
|
475
|
+
return sys.version_info[:2] >= (3, 12)
|
|
476
|
+
|
|
477
|
+
|
|
473
478
|
def is_v4_zero_ip(ip_port_addr: str) -> bool:
|
|
474
479
|
return ip_port_addr.split("://")[-1].startswith("0.0.0.0:")
|
|
475
480
|
|
xoscar/virtualenv/platform.py
CHANGED
|
@@ -39,7 +39,10 @@ def get_cuda_arch() -> Optional[str]:
|
|
|
39
39
|
|
|
40
40
|
major, minor = torch.cuda.get_device_capability()
|
|
41
41
|
return f"sm_{major}{minor}" # e.g. 'sm_80'
|
|
42
|
-
except (ImportError, AttributeError):
|
|
42
|
+
except (ImportError, AttributeError, AssertionError):
|
|
43
|
+
# If no cuda available,
|
|
44
|
+
# AssertionError("Torch not compiled with CUDA enabled")
|
|
45
|
+
# will be raised
|
|
43
46
|
return None
|
|
44
47
|
|
|
45
48
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: xoscar
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Python actor framework for heterogeneous computing.
|
|
5
5
|
Home-page: http://github.com/xorbitsai/xoscar
|
|
6
6
|
Author: Qin Xuye
|
|
@@ -15,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
19
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
20
|
Classifier: Topic :: Software Development :: Libraries
|
|
20
21
|
Description-Content-Type: text/markdown
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
xoscar-0.
|
|
2
|
-
xoscar-0.
|
|
3
|
-
xoscar-0.
|
|
4
|
-
xoscar-0.
|
|
1
|
+
xoscar-0.8.0.dist-info/RECORD,,
|
|
2
|
+
xoscar-0.8.0.dist-info/WHEEL,sha256=adaOEtyuP97RoRqh2LNimBtjXokZHsp60v8dUwE5uKE,137
|
|
3
|
+
xoscar-0.8.0.dist-info/top_level.txt,sha256=vYlqqY4Nys8Thm1hePIuUv8eQePdULVWMmt7lXtX_ZA,21
|
|
4
|
+
xoscar-0.8.0.dist-info/METADATA,sha256=XR2fCMVzPBcnQ0vKr6QMLRlwA0NZkKb4xx76-iagFs8,9185
|
|
5
5
|
xoscar/_utils.pyx,sha256=frgVQ5xGp92jBKc4PsPmjOlVsXlKeHWtTOAMfHmBaII,7380
|
|
6
6
|
xoscar/backend.py,sha256=is436OPkZfSpQXaoqTRVta5eoye_pp45RFgCstAk2hU,1850
|
|
7
7
|
xoscar/core.pxd,sha256=I_C2ka7XryyGnnAVXUVm8xfS1gtIrCs6X-9rswgOcUU,1317
|
|
@@ -10,18 +10,18 @@ xoscar/context.pxd,sha256=qKa0OyDPZtVymftSh447m-RzFZgmz8rGqQBa7qlauvc,725
|
|
|
10
10
|
xoscar/batch.py,sha256=DpArS0L3WYJ_HVPG-6hSYEwoAFY1mY2-mlC4Jp5M_Dw,7872
|
|
11
11
|
xoscar/nvutils.py,sha256=qmW4mKLU0WB2yCs198ccQOgLL02zB7Fsa-AotO3NOmg,20412
|
|
12
12
|
xoscar/constants.py,sha256=QHHSREw6uWBBjQDCFqlNfTvBZgniJPGy42KSIsR8Fqw,787
|
|
13
|
-
xoscar/_utils.cpython-310-darwin.so,sha256=
|
|
13
|
+
xoscar/_utils.cpython-310-darwin.so,sha256=xC42ijCc1Pi1H0JOrQJkFgEMJs-amW82BGcA2IUNmzQ,169584
|
|
14
14
|
xoscar/__init__.py,sha256=sy7Wtn2EuQZI0I4Az_MfsBVZm4G0DRj46qRyExgmnJk,1622
|
|
15
15
|
xoscar/api.py,sha256=zxNqOjGiTIKuAip9WJ0LOoM7yevD6P5rb-sLynpZ2Zo,14648
|
|
16
|
-
xoscar/utils.py,sha256=
|
|
16
|
+
xoscar/utils.py,sha256=vo1DS6xHnvZ9jDS9qpGFFN4GCJDo6nyhMINWOna5dp8,16582
|
|
17
17
|
xoscar/debug.py,sha256=9Z8SgE2WaKYQcyDo-5-DxEJQ533v7kWjrvCd28pSx3E,5069
|
|
18
18
|
xoscar/libcpp.pxd,sha256=DJqBxLFOKL4iRr9Kale5UH3rbvPRD1x5bTSOPHFpz9I,1147
|
|
19
19
|
xoscar/context.pyx,sha256=8CdgPnWcE9eOp3N600WgDQ03MCi8P73eUOGcfV7Zksg,10942
|
|
20
20
|
xoscar/errors.py,sha256=wBlQOKsXf0Fc4skN39tDie0YZT-VIAuLNRgoDl2pZcA,1241
|
|
21
21
|
xoscar/core.pyx,sha256=phN-yYV0A0QI8WFi2jCu0nc4CnShTepfDi0V7ZrLYPY,22092
|
|
22
22
|
xoscar/driver.py,sha256=498fowtJr6b3FE8FIOA_Tc1Vwx88nfZw7p0FxrML0h4,1372
|
|
23
|
-
xoscar/context.cpython-310-darwin.so,sha256=
|
|
24
|
-
xoscar/core.cpython-310-darwin.so,sha256=
|
|
23
|
+
xoscar/context.cpython-310-darwin.so,sha256=Aa5Xe4alRLdaFTlhCZaGFv8YSmbj89StkO_BRUmuUPk,211328
|
|
24
|
+
xoscar/core.cpython-310-darwin.so,sha256=ZSPlzx0miEEuVKR5RMeM3ESkCAjFMbwxSw1gjra02p0,409608
|
|
25
25
|
xoscar/profiling.py,sha256=BC5OF0HzSaXv8V7w-y-B8r5gV5DgxHFoTEIF6jCMioQ,8015
|
|
26
26
|
xoscar/_utils.pxd,sha256=5KYAL3jfPdejsHnrGGT2s--ZUX5SXznQWpHVSno429k,1157
|
|
27
27
|
xoscar/metrics/__init__.py,sha256=9Badi7rxYikGm2dQiNCrj9GgMRBxwuR3JaEKcFZmfak,705
|
|
@@ -32,12 +32,14 @@ xoscar/metrics/backends/prometheus/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4
|
|
|
32
32
|
xoscar/metrics/backends/prometheus/prometheus_metric.py,sha256=MxoMvVrg0pOkKpkjJ0PcAuEaaEJR2FZljmPrLjQ1-oc,2050
|
|
33
33
|
xoscar/metrics/backends/console/console_metric.py,sha256=y5CCtH33j3AqI5_Uhwi4mgOcAhyhb4cWv_YvR6fxcbQ,2082
|
|
34
34
|
xoscar/metrics/backends/console/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
|
|
35
|
-
xoscar/collective/xoscar_pygloo.cpython-310-darwin.so,sha256=
|
|
35
|
+
xoscar/collective/xoscar_pygloo.cpython-310-darwin.so,sha256=pDCrh35TJYP1qlzbq1GRUayCSxh-jxWlWdSjtYSuEzE,1127552
|
|
36
36
|
xoscar/collective/__init__.py,sha256=XsClIkO_3Jd8GDifYuAbZCmJLAo9ZqGvnjUn9iuogmU,774
|
|
37
37
|
xoscar/collective/core.py,sha256=NVR-7Iaq3aDPCN6fgXcq9Ew6uFEszRwxYqmUG9FLcws,23502
|
|
38
38
|
xoscar/collective/common.py,sha256=INAnISbfnRicbbbDHTqbSr9ITb89ZphH5BUkSpEdXXU,3561
|
|
39
39
|
xoscar/collective/utils.py,sha256=3S4qF4JEnAUD3RiWVBUj-ZptL83CBSwGYyVZyIasAsE,1178
|
|
40
40
|
xoscar/collective/process_group.py,sha256=zy7LcIFnEcmrcxuECI89v0bQlUbSqQMkVyBw468WBnk,22599
|
|
41
|
+
xoscar/collective/backend/nccl_backend.py,sha256=7VvjAVTkr6qWJC1CztzJ5CN9USGJkstO-RAbaPKQA-Y,6280
|
|
42
|
+
xoscar/collective/backend/__init__.py,sha256=CyLLkbImZouAk4lePIgKXT4WQoqyauIEwdqea5IOUVU,581
|
|
41
43
|
xoscar/serialization/exception.py,sha256=Jy8Lsk0z-VJyEUaWeuZIwkmxqaoB-nLKMa1D15Cl4js,1634
|
|
42
44
|
xoscar/serialization/pyfury.py,sha256=sifOnVMYoS82PzZEkzkfxesmMHei23k5UAUUKUyoOYQ,1163
|
|
43
45
|
xoscar/serialization/core.pxd,sha256=k4RoJgX5E5LGs4jdCQ7vvcn26MabXbrWoWhkO49X6YI,985
|
|
@@ -47,33 +49,33 @@ xoscar/serialization/cuda.py,sha256=iFUEnN4SiquBIhyieyOrfw3TnKnW-tU_vYgqOxO_DrA,
|
|
|
47
49
|
xoscar/serialization/scipy.py,sha256=yOEi0NB8cqQ6e2UnCZ1w006RsB7T725tIL-DM_hNcsU,2482
|
|
48
50
|
xoscar/serialization/aio.py,sha256=5DySPgDxU43ec7_5Ct44-Oqt7YNSJBfuf8VdQgQlChA,4731
|
|
49
51
|
xoscar/serialization/core.pyx,sha256=bjR-zXGm9qersk7kYPzpjpMIxDl_Auur4BCubRfKmfA,29626
|
|
50
|
-
xoscar/serialization/core.cpython-310-darwin.so,sha256=
|
|
52
|
+
xoscar/serialization/core.cpython-310-darwin.so,sha256=G1JW-fCbnCIAzGIlx6PJqTC7kdRBYnat2iXC1rON9VU,363288
|
|
51
53
|
xoscar/serialization/mlx.py,sha256=tRu_7o6RizdRhbr88EasHrZtShimAsLy3pIEO-by29o,2118
|
|
52
54
|
xoscar/backends/config.py,sha256=4tZMiXAMMS8qQ4SX_LjONLtSQVfZTx3m-IK3EqbkYdk,5375
|
|
53
55
|
xoscar/backends/allocate_strategy.py,sha256=tC1Nbq2tJohahUwd-zoRYHEDX65wyuX8tmeY45uWj_w,4845
|
|
54
|
-
xoscar/backends/message.cpython-310-darwin.so,sha256=
|
|
56
|
+
xoscar/backends/message.cpython-310-darwin.so,sha256=Sr1-FSsEGCIcum9TUAQzU3g1wuqWKb3l4xT66iImPGI,349424
|
|
55
57
|
xoscar/backends/__init__.py,sha256=VHEBQcUWM5bj027W8EUf9PiJUAP7JoMrRw3Tsvy5ySw,643
|
|
56
|
-
xoscar/backends/core.py,sha256=
|
|
58
|
+
xoscar/backends/core.py,sha256=hhowZJ1PmsmK1ap3IkIny-50cM3qawYbtUHVSBxzfPc,12845
|
|
57
59
|
xoscar/backends/context.py,sha256=XfDPG2eDhAhE6hWBEkEsHTnyyOYN9R3houlMjAL7BFw,16329
|
|
58
60
|
xoscar/backends/router.py,sha256=MVl5naz-FYf-Wla7XRn3kRxOpWV0SjKDsKNluifVA8M,10532
|
|
59
61
|
xoscar/backends/message.pyx,sha256=krGVtZ1YDaZX8yWhaNHwZiudQooLvcGlw6x3Sq7jxjE,19685
|
|
60
|
-
xoscar/backends/pool.py,sha256=
|
|
62
|
+
xoscar/backends/pool.py,sha256=7yyU5EzU0Izf3j8ifGovpGbGR3KrCoYIU0L3JH_e9WM,60683
|
|
61
63
|
xoscar/backends/indigen/backend.py,sha256=znl_fZzWGEtLH8hZ9j9Kkf0fva25jEem2_KO7I1RVvc,1612
|
|
62
64
|
xoscar/backends/indigen/shared_memory.py,sha256=wqbckbgnd0qNm5KzlP_hklF3F_n8fKnCehSox5uMwNs,19082
|
|
63
65
|
xoscar/backends/indigen/__init__.py,sha256=tKHP5ClzedBRBpZsLRVErR3EUNbbDm4CY4u0rCFJr44,685
|
|
64
66
|
xoscar/backends/indigen/fate_sharing.py,sha256=3QUHwq5Cjk9oCKFUISvkqHaoxWZIaXcq8JNOetdBl-A,8655
|
|
65
67
|
xoscar/backends/indigen/driver.py,sha256=VGzkacYKykegW5qhCuhx01gdgBZEKJjNIyfNCnA6Nm8,952
|
|
66
|
-
xoscar/backends/indigen/pool.py,sha256=
|
|
68
|
+
xoscar/backends/indigen/pool.py,sha256=RAbXNkb8y5QElAY2zcD6NBb8XBT5bt2GqFc12H3SwqY,18504
|
|
67
69
|
xoscar/backends/indigen/__main__.py,sha256=-pfio-Y4Ogbk6lBFksH-gRatp-N6sZ7wuNc-i2YsLJc,510
|
|
68
70
|
xoscar/backends/test/backend.py,sha256=nv9WFhH5Bbq4Q1HB9yfpciZBaeHT4IQAtzugBWESrUY,1263
|
|
69
71
|
xoscar/backends/test/__init__.py,sha256=j2ZfD6prD9WjUxRUDC7Eq5Z7N7TkL6fFr59oNyc_vY4,682
|
|
70
|
-
xoscar/backends/test/pool.py,sha256=
|
|
72
|
+
xoscar/backends/test/pool.py,sha256=tPq-MKHmiB9SOVo64y6Nv-U3evdxVQZuuWLYR-fj3ZA,8257
|
|
71
73
|
xoscar/backends/communication/ucx.py,sha256=_Dp9Ld2MWIa1txSGMnmfYwJDT0esxS-GOd2FQ4BdHiM,19960
|
|
72
74
|
xoscar/backends/communication/__init__.py,sha256=oFIg83Ga93-AhrG52TE85Z2LgpGZu1RCgQu1RWi62zQ,1063
|
|
73
75
|
xoscar/backends/communication/core.py,sha256=sJeE3foRIqVPXldzYpFKHDSsabfAIFBU4JuXY4OyklY,2130
|
|
74
76
|
xoscar/backends/communication/utils.py,sha256=AmovE-hmWLXNCPwHafYuaRjOk8m42BUyT3XBqfXQRVI,3664
|
|
75
77
|
xoscar/backends/communication/errors.py,sha256=V3CdBe2xX9Rwv32f2dH2Msc84yaUhlyerZ42-739o1Q,723
|
|
76
|
-
xoscar/backends/communication/socket.py,sha256=
|
|
78
|
+
xoscar/backends/communication/socket.py,sha256=8GW-7HNnZE7tqvLyU6L9B75kaQvI4atc6AGwVHq1IGE,14482
|
|
77
79
|
xoscar/backends/communication/dummy.py,sha256=6kLkxjNk4xTQ-IlNZD6cftNCx5UsGOur2jk7ikrNUCg,8157
|
|
78
80
|
xoscar/backends/communication/base.py,sha256=0P4Tr35GSWpRp394e9jVWUUoKKa-gIk177eYPw1BnSU,7421
|
|
79
81
|
xoscar/aio/__init__.py,sha256=kViDKR_kJe59VQViHITKEfBcIgN4ZJblUyd8zl0E3ZI,675
|
|
@@ -83,6 +85,6 @@ xoscar/aio/parallelism.py,sha256=VSsjk8wP-Bw7tLeUsTyLVNgp91thjxEfE3pCrw_vF5Q,129
|
|
|
83
85
|
xoscar/aio/base.py,sha256=9j0f1piwfE5R5GIvV212vSD03ixdaeSzSSsO2kxJZVE,2249
|
|
84
86
|
xoscar/virtualenv/__init__.py,sha256=65t9_X1DvbanNjFy366SiiWZrRTpa9SXWMXPmqayE-4,1117
|
|
85
87
|
xoscar/virtualenv/core.py,sha256=Ij36UQaej9fFaz1PfqkEtL1ss8yBribXHcWT115kH-o,8098
|
|
86
|
-
xoscar/virtualenv/platform.py,sha256=
|
|
88
|
+
xoscar/virtualenv/platform.py,sha256=JGEp6tbCeACr3MC9UAdaOGJIJEKjWFRtScpKuGX0_Hs,1541
|
|
87
89
|
xoscar/virtualenv/utils.py,sha256=qKHw7Gg0n3JuzKFjhBnftPq2QWlgNJLk1sGPr5GzamM,2875
|
|
88
90
|
xoscar/virtualenv/uv.py,sha256=VBw045LN8gYMLgjaazt7-tnwBveWr7YYE2zjDsL18h0,11091
|
|
File without changes
|
|
File without changes
|