xoscar 0.9.0__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. xoscar/__init__.py +61 -0
  2. xoscar/_utils.cpython-312-darwin.so +0 -0
  3. xoscar/_utils.pxd +36 -0
  4. xoscar/_utils.pyx +246 -0
  5. xoscar/_version.py +693 -0
  6. xoscar/aio/__init__.py +16 -0
  7. xoscar/aio/base.py +86 -0
  8. xoscar/aio/file.py +59 -0
  9. xoscar/aio/lru.py +228 -0
  10. xoscar/aio/parallelism.py +39 -0
  11. xoscar/api.py +527 -0
  12. xoscar/backend.py +67 -0
  13. xoscar/backends/__init__.py +14 -0
  14. xoscar/backends/allocate_strategy.py +160 -0
  15. xoscar/backends/communication/__init__.py +30 -0
  16. xoscar/backends/communication/base.py +315 -0
  17. xoscar/backends/communication/core.py +69 -0
  18. xoscar/backends/communication/dummy.py +253 -0
  19. xoscar/backends/communication/errors.py +20 -0
  20. xoscar/backends/communication/socket.py +444 -0
  21. xoscar/backends/communication/ucx.py +538 -0
  22. xoscar/backends/communication/utils.py +97 -0
  23. xoscar/backends/config.py +157 -0
  24. xoscar/backends/context.py +437 -0
  25. xoscar/backends/core.py +352 -0
  26. xoscar/backends/indigen/__init__.py +16 -0
  27. xoscar/backends/indigen/__main__.py +19 -0
  28. xoscar/backends/indigen/backend.py +51 -0
  29. xoscar/backends/indigen/driver.py +26 -0
  30. xoscar/backends/indigen/fate_sharing.py +221 -0
  31. xoscar/backends/indigen/pool.py +515 -0
  32. xoscar/backends/indigen/shared_memory.py +548 -0
  33. xoscar/backends/message.cpython-312-darwin.so +0 -0
  34. xoscar/backends/message.pyi +255 -0
  35. xoscar/backends/message.pyx +646 -0
  36. xoscar/backends/pool.py +1630 -0
  37. xoscar/backends/router.py +285 -0
  38. xoscar/backends/test/__init__.py +16 -0
  39. xoscar/backends/test/backend.py +38 -0
  40. xoscar/backends/test/pool.py +233 -0
  41. xoscar/batch.py +256 -0
  42. xoscar/collective/__init__.py +27 -0
  43. xoscar/collective/backend/__init__.py +13 -0
  44. xoscar/collective/backend/nccl_backend.py +160 -0
  45. xoscar/collective/common.py +102 -0
  46. xoscar/collective/core.py +737 -0
  47. xoscar/collective/process_group.py +687 -0
  48. xoscar/collective/utils.py +41 -0
  49. xoscar/collective/xoscar_pygloo.cpython-312-darwin.so +0 -0
  50. xoscar/collective/xoscar_pygloo.pyi +239 -0
  51. xoscar/constants.py +23 -0
  52. xoscar/context.cpython-312-darwin.so +0 -0
  53. xoscar/context.pxd +21 -0
  54. xoscar/context.pyx +368 -0
  55. xoscar/core.cpython-312-darwin.so +0 -0
  56. xoscar/core.pxd +51 -0
  57. xoscar/core.pyx +664 -0
  58. xoscar/debug.py +188 -0
  59. xoscar/driver.py +42 -0
  60. xoscar/errors.py +63 -0
  61. xoscar/libcpp.pxd +31 -0
  62. xoscar/metrics/__init__.py +21 -0
  63. xoscar/metrics/api.py +288 -0
  64. xoscar/metrics/backends/__init__.py +13 -0
  65. xoscar/metrics/backends/console/__init__.py +13 -0
  66. xoscar/metrics/backends/console/console_metric.py +82 -0
  67. xoscar/metrics/backends/metric.py +149 -0
  68. xoscar/metrics/backends/prometheus/__init__.py +13 -0
  69. xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
  70. xoscar/nvutils.py +717 -0
  71. xoscar/profiling.py +260 -0
  72. xoscar/serialization/__init__.py +20 -0
  73. xoscar/serialization/aio.py +141 -0
  74. xoscar/serialization/core.cpython-312-darwin.so +0 -0
  75. xoscar/serialization/core.pxd +28 -0
  76. xoscar/serialization/core.pyi +57 -0
  77. xoscar/serialization/core.pyx +944 -0
  78. xoscar/serialization/cuda.py +111 -0
  79. xoscar/serialization/exception.py +48 -0
  80. xoscar/serialization/mlx.py +67 -0
  81. xoscar/serialization/numpy.py +82 -0
  82. xoscar/serialization/pyfury.py +37 -0
  83. xoscar/serialization/scipy.py +72 -0
  84. xoscar/serialization/torch.py +180 -0
  85. xoscar/utils.py +522 -0
  86. xoscar/virtualenv/__init__.py +34 -0
  87. xoscar/virtualenv/core.py +268 -0
  88. xoscar/virtualenv/platform.py +56 -0
  89. xoscar/virtualenv/utils.py +100 -0
  90. xoscar/virtualenv/uv.py +321 -0
  91. xoscar-0.9.0.dist-info/METADATA +230 -0
  92. xoscar-0.9.0.dist-info/RECORD +94 -0
  93. xoscar-0.9.0.dist-info/WHEEL +6 -0
  94. xoscar-0.9.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,515 @@
1
+ # Copyright 2022-2023 XProbe Inc.
2
+ # derived from copyright 1999-2021 Alibaba Group Holding Ltd.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import asyncio.subprocess
20
+ import configparser
21
+ import itertools
22
+ import logging.config
23
+ import os
24
+ import pickle
25
+ import random
26
+ import signal
27
+ import struct
28
+ import sys
29
+ import threading
30
+ import time
31
+ import uuid
32
+ from enum import IntEnum
33
+ from typing import List, Optional
34
+
35
+ import psutil
36
+
37
+ from ..._utils import reset_id_random_seed
38
+ from ...utils import ensure_coverage
39
+ from ..config import ActorPoolConfig
40
+ from ..message import (
41
+ ControlMessage,
42
+ ControlMessageType,
43
+ CreateActorMessage,
44
+ new_message_id,
45
+ )
46
+ from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
47
+ from . import shared_memory
48
+ from .fate_sharing import create_subprocess_exec
49
+
50
# Size, in bytes, of the shared-memory segment created for each sub pool
# start-up exchange (10 KiB).
_SUBPROCESS_SHM_SIZE = 10 * 1024
# True when the current interpreter runs on a Windows platform.
_is_windows: bool = sys.platform.startswith("win")

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
54
+
55
+
56
class _ShmSeq(IntEnum):
    """Sequence tags stored in the first four bytes of the shared-memory buffer.

    The tag tells a reader which record is currently stored in the buffer.
    """

    # Record holding the start-up parameters for a sub pool.
    INIT_PARAMS = 1
    # Record holding the result reported once a sub pool has started.
    INIT_RESULT = 2
59
+
60
+
61
def _shm_put_object(seq: _ShmSeq, shm: shared_memory.SharedMemory, o: object):
    """Pickle ``o`` and store it in ``shm`` tagged with ``seq``.

    Buffer layout (all integers are little-endian unsigned 32-bit):

    * bytes ``0..3``  -- sequence tag (``seq``)
    * bytes ``4..7``  -- length of the pickled payload
    * bytes ``8..``   -- pickled payload

    Args:
        seq: Tag identifying the kind of record being written.
        shm: Shared-memory segment whose ``buf`` receives the record.
        o: Any picklable object.

    Raises:
        AssertionError: If the pickled payload does not fit into
            ``_SUBPROCESS_SHM_SIZE`` minus the 8-byte header.
    """
    serialized = pickle.dumps(o)
    if len(serialized) >= _SUBPROCESS_SHM_SIZE - 8:
        # Raised explicitly (rather than via ``assert``) so that the size
        # check is still enforced when Python runs with ``-O``.
        raise AssertionError(f"Serialized object {o} is too long.")
    shm.buf[4:8] = struct.pack("<I", len(serialized))
    shm.buf[8 : 8 + len(serialized)] = serialized
    # The tag is written last; presumably so that a reader polling the tag
    # only sees it once length and payload are already in place.
    shm.buf[:4] = struct.pack("<I", seq)
69
+
70
+
71
def _shm_get_object(seq: _ShmSeq, shm: shared_memory.SharedMemory):
    """Read the record tagged ``seq`` from ``shm``.

    Returns:
        The unpickled object when the tag stored in the first four bytes of
        the buffer equals ``seq``; ``None`` otherwise.
    """
    (stored_tag,) = struct.unpack_from("<I", shm.buf, 0)
    if stored_tag != seq:
        # The buffer currently holds a different record (or nothing yet).
        return None
    (payload_len,) = struct.unpack_from("<I", shm.buf, 4)
    payload = shm.buf[8 : 8 + payload_len]
    return pickle.loads(payload)
77
+
78
+
79
@_register_message_handler
class MainActorPool(MainActorPoolBase):
    """Main actor pool whose sub pools run as separate Python subprocesses.

    Each sub pool is launched with
    ``<python> -m xoscar.backends.indigen start_sub_pool -sn <shm name>`` and
    exchanges its start-up parameters and result with the parent through a
    small shared-memory segment.
    """

    @classmethod
    def get_external_addresses(
        cls,
        address: str,
        n_process: int | None = None,
        ports: list[int] | None = None,
        schemes: list[Optional[str]] | None = None,
    ):
        """Get external address for every process.

        Args:
            address: ``host`` or ``host:port`` of the main pool.
            n_process: Number of sub processes; must not be ``None``.
            ports: Optional explicit ports. When ``address`` already contains
                a port, exactly ``n_process`` sub pool ports are expected;
                otherwise ``n_process + 1`` ports are expected, the first
                being the main pool port. Missing ports default to ``0``.
            schemes: Optional per-address schemes; a truthy scheme is
                prepended as ``"<scheme>://"``.

        Returns:
            A list of addresses, main pool first, followed by the sub pools.

        Raises:
            ValueError: If the number of ``ports`` does not match.
        """
        assert n_process is not None
        if ":" in address:
            host, port_str = address.rsplit(":", 1)
            port = int(port_str)
            if ports:
                if len(ports) != n_process:
                    raise ValueError(
                        f"`ports` specified, but its count "
                        f"is not equal to `n_process`, "
                        f"number of ports: {len(ports)}, "
                        f"n_process: {n_process}"
                    )
                sub_ports = ports
            else:
                sub_ports = [0] * n_process
        else:
            host = address
            if ports and len(ports) != n_process + 1:
                # ports specified, the first of which should be main port
                raise ValueError(
                    f"`ports` specified, but its count "
                    f"is not equal to `n_process` + 1, "
                    f"number of ports: {len(ports)}, "
                    f"n_process + 1: {n_process + 1}"
                )
            elif not ports:
                ports = [0] * (n_process + 1)
            port = ports[0]
            sub_ports = ports[1:]
        if not schemes:
            prefix_iter = itertools.repeat("")
        else:
            prefix_iter = [f"{scheme}://" if scheme else "" for scheme in schemes]  # type: ignore
        # ``zip`` stops at the shorter input, so a ``schemes`` list shorter
        # than the address list truncates the result.
        return [
            f"{prefix}{host}:{port}"
            for port, prefix in zip([port] + sub_ports, prefix_iter)
        ]

    @classmethod
    def gen_internal_address(
        cls, process_index: int, external_address: str | None = None
    ) -> str | None:
        """Return the internal address for the pool at ``process_index``.

        A ``unixsocket:///<process_index>`` address is used when ``asyncio``
        provides ``start_unix_server``; otherwise ``external_address`` is
        returned unchanged.
        """
        if hasattr(asyncio, "start_unix_server"):
            return f"unixsocket:///{process_index}"
        else:
            return external_address

    @classmethod
    async def start_sub_pool(
        cls,
        actor_pool_config: ActorPoolConfig,
        process_index: int,
        start_python: str | None = None,
    ):
        """Start the sub pool at ``process_index`` in a new subprocess.

        Returns:
            The ``(process, external_addresses)`` pair produced by
            ``_create_sub_pool_from_parent``.
        """
        return await cls._create_sub_pool_from_parent(
            actor_pool_config, process_index, start_python
        )

    @classmethod
    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
        """Wait for every sub pool creation task to finish.

        Args:
            create_pool_tasks: Tasks that each resolve to a
                ``(process, external_addresses)`` pair.

        Returns:
            ``(processes, ext_addresses)`` in task order.

        Raises:
            Exception: The first error raised by any task. Before it is
                re-raised every sub process that did start is killed, so a
                partial start-up does not leave processes behind.
        """
        processes: list[asyncio.subprocess.Process] = []
        ext_addresses = []
        error: Exception | None = None
        for task in create_pool_tasks:
            try:
                process, address = await task
            except Exception as e:  # noqa: BLE001 - re-raised below
                # Keep awaiting the remaining tasks so that every process
                # that does come up can be cleaned up afterwards.
                if error is None:
                    error = e
                continue
            processes.append(process)
            ext_addresses.append(address)
        if error is not None:
            for p in processes:
                # error happens, kill all subprocesses
                try:
                    p.kill()
                except ProcessLookupError:
                    # Process already terminated
                    pass
            raise error
        return processes, ext_addresses

    @classmethod
    def _start_sub_pool_in_child(
        cls,
        shm_name: str,
    ):
        """Entry point executed inside the sub pool subprocess.

        Reads the start-up parameters from the shared-memory segment named
        ``shm_name``, starts a watchdog thread that exits the process once the
        main pool process no longer exists, applies signal and logging
        configuration, and then runs the sub pool until it finishes.

        Raises:
            RuntimeError: If the segment does not contain the start-up
                parameters.
        """
        ensure_coverage()

        shm = shared_memory.SharedMemory(shm_name, track=False)
        try:
            config = _shm_get_object(_ShmSeq.INIT_PARAMS, shm)
            if config is None:
                raise RuntimeError(
                    f"No init params found in shared memory {shm_name}."
                )
            # Check Python version once.
            # The value stored by the parent is the *main* pool's version;
            # ``sys.hexversion`` here is the version of this sub pool.
            main_pool_python_version = config.pop("python_version", None)
            if (
                main_pool_python_version is not None
                and main_pool_python_version != sys.hexversion
            ):
                logger.warning(
                    "The sub pool is using a different Python version, you may encounter serialization issues."
                    " sub pool: %s, main pool: %s",
                    sys.hexversion,
                    main_pool_python_version,
                )
            actor_config = config["actor_pool_config"]
            process_index = config["process_index"]
            main_pool_pid = config["main_pool_pid"]

            def _check_ppid():
                while True:
                    try:
                        # We can't simply check if the os.getppid() equals with main_pool_pid,
                        # as the double fork may result in a new process as the parent.
                        psutil.Process(main_pool_pid)
                    except psutil.NoSuchProcess:
                        logger.error("Exit due to main pool %s exit.", main_pool_pid)
                        os._exit(233)  # Special exit code for debugging.
                    except Exception as e:
                        logger.exception("Check ppid failed: %s", e)
                    time.sleep(10)

            t = threading.Thread(target=_check_ppid, daemon=True)
            t.start()

            # make sure enough randomness for every sub pool
            random.seed(uuid.uuid1().bytes)
            reset_id_random_seed()

            conf = actor_config.get_pool_config(process_index)
            suspend_sigint = conf["suspend_sigint"]
            if suspend_sigint:
                # Replace the SIGINT handler with a no-op.
                signal.signal(signal.SIGINT, lambda *_: None)

            logging_conf = conf["logging_conf"] or {}
            if isinstance(logging_conf, configparser.RawConfigParser):
                logging.config.fileConfig(logging_conf)
            elif logging_conf.get("dict"):
                logging.config.dictConfig(logging_conf["dict"])
            elif logging_conf.get("file"):
                logging.config.fileConfig(logging_conf["file"])
            elif logging_conf.get("level"):
                logging.getLogger("__main__").setLevel(logging_conf["level"])
                logging.getLogger("xoscar").setLevel(logging_conf["level"])
                if logging_conf.get("format"):
                    logging.basicConfig(format=logging_conf["format"])

            # NOTE(review): ``asyncio.run`` below creates and uses its own
            # event loop, so the loop installed here is presumably not the
            # one that executes ``coro`` -- confirm whether ``use_uvloop``
            # actually takes effect in sub pools.
            use_uvloop = conf["use_uvloop"]
            if use_uvloop:
                import uvloop

                asyncio.set_event_loop(uvloop.new_event_loop())
            else:
                asyncio.set_event_loop(asyncio.new_event_loop())

            coro = cls._create_sub_pool(actor_config, process_index, main_pool_pid, shm)
            asyncio.run(coro)
        finally:
            shm.close()

    @classmethod
    async def _create_sub_pool(
        cls,
        actor_config: ActorPoolConfig,
        process_index: int,
        main_pool_pid: int,
        shm: shared_memory.SharedMemory,
    ):
        """Create and run the ``SubActorPool`` inside the subprocess.

        After the pool has started, its configured external address is written
        to ``shm`` as the ``INIT_RESULT`` record; the coroutine then waits
        until the pool is joined.
        """
        cur_pool_config = actor_config.get_pool_config(process_index)
        env = cur_pool_config["env"]
        if env:
            os.environ.update(env)
        pool = await SubActorPool.create(
            {
                "actor_pool_config": actor_config,
                "process_index": process_index,
                "main_pool_pid": main_pool_pid,
            }
        )
        await pool.start()
        _shm_put_object(_ShmSeq.INIT_RESULT, shm, cur_pool_config["external_address"])
        await pool.join()

    @staticmethod
    async def _create_sub_pool_from_parent(
        actor_pool_config: ActorPoolConfig,
        process_index: int,
        start_python: str | None = None,
    ):
        """Spawn the subprocess for a sub pool and wait for its start-up result.

        Args:
            actor_pool_config: Configuration shared with the sub pool.
            process_index: Index of the sub pool to start.
            start_python: Python executable used for the subprocess;
                defaults to ``sys.executable``.

        Returns:
            ``(process, external_addresses)`` where ``external_addresses`` is
            the value the sub pool reported through shared memory.

        Raises:
            OSError: If the subprocess ended (or polling stopped) without a
                start-up result having been reported.
        """
        # The Python version is sent along with the init params and compared
        # in ``_start_sub_pool_in_child``, which is cheap as in most cases the
        # Python versions are the same.
        if start_python is None:
            start_python = sys.executable

        external_addresses: List | None = None
        shm = shared_memory.SharedMemory(
            create=True, size=_SUBPROCESS_SHM_SIZE, track=False
        )
        try:
            _shm_put_object(
                _ShmSeq.INIT_PARAMS,
                shm,
                {
                    "actor_pool_config": actor_pool_config,
                    "process_index": process_index,
                    "main_pool_pid": os.getpid(),
                    "python_version": sys.hexversion,
                },
            )
            cmd = [
                start_python,
                "-m",
                "xoscar.backends.indigen",
                "start_sub_pool",
                "-sn",
                shm.name,
            ]
            # We need to inherit the parent environment to ensure the subprocess works correctly on Windows.
            new_env = dict(os.environ)
            env = actor_pool_config.get_pool_config(process_index).get("env") or {}
            new_env.update(env)
            if os.getenv("XOSCAR_CPU_AFFINITY") == "1":
                import multiprocessing

                total_cores = multiprocessing.cpu_count()
                all_cores_range = f"0-{total_cores - 1}"
                cmd = ["taskset", "-c", all_cores_range] + cmd
            logger.info("Creating sub pool via command: %s", cmd)
            process = await create_subprocess_exec(*cmd, env=new_env)

            # Cancelling the asyncio task that wraps ``asyncio.to_thread``
            # does not stop the worker thread, so the poller is told to stop
            # through this event instead.
            stop_polling = threading.Event()

            def _get_external_addresses():
                nonlocal external_addresses
                while not stop_polling.is_set() and shm.buf is not None:
                    external_addresses = _shm_get_object(_ShmSeq.INIT_RESULT, shm)
                    if external_addresses:
                        return
                    # Sleeps up to 0.1s but wakes up early when asked to stop.
                    stop_polling.wait(0.1)

            wait_task = asyncio.create_task(process.wait())
            poll_task = asyncio.create_task(asyncio.to_thread(_get_external_addresses))
            try:
                await asyncio.wait(
                    [wait_task, poll_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )
            finally:
                stop_polling.set()
                wait_task.cancel()
                # Let the poller thread finish before the shared memory is
                # closed below, so it never touches a released buffer.
                (poll_outcome,) = await asyncio.gather(
                    poll_task, return_exceptions=True
                )
                if isinstance(poll_outcome, Exception):
                    logger.error(
                        "Reading sub pool start result failed.",
                        exc_info=poll_outcome,
                    )
        finally:
            shm.close()
            shm.unlink()
        if external_addresses is None:
            raise OSError(f"Start sub pool failed, returncode: {process.returncode}")
        return process, external_addresses

    async def append_sub_pool(
        self,
        label: str | None = None,
        internal_address: str | None = None,
        external_address: str | None = None,
        env: dict | None = None,
        modules: list[str] | None = None,
        suspend_sigint: bool | None = None,
        use_uvloop: bool | None = None,
        logging_conf: dict | None = None,
        start_python: str | None = None,
        kwargs: dict | None = None,
    ):
        """Add a new sub pool to this main pool and start it.

        ``logging_conf`` and ``use_uvloop`` fall back to the values of the
        last configured process when not given. After the subprocess has
        started, the updated configuration is synchronised through a
        ``sync_config`` control message.

        Returns:
            The first external address reported by the new sub pool.
        """
        # external_address has port 0, subprocess will bind random port.
        external_address = (
            external_address
            or MainActorPool.get_external_addresses(self.external_address, n_process=1)[
                -1
            ]
        )

        # use last process index's logging_conf and use_uv_loop config if not provide
        actor_pool_config = self._config.as_dict()
        last_process_index = self._config.get_process_indexes()[-1]
        last_logging_conf = actor_pool_config["pools"][last_process_index][
            "logging_conf"
        ]
        last_use_uv_loop = actor_pool_config["pools"][last_process_index]["use_uvloop"]
        _logging_conf = logging_conf or last_logging_conf
        _use_uv_loop = use_uvloop if use_uvloop is not None else last_use_uv_loop

        process_index = next(MainActorPool.process_index_gen(external_address))
        internal_address = internal_address or MainActorPool.gen_internal_address(
            process_index, external_address
        )

        self._config.add_pool_conf(
            process_index,
            label,
            internal_address,
            external_address,
            env,
            modules,
            suspend_sigint,
            _use_uv_loop,
            _logging_conf,
            kwargs,
        )

        process, external_addresses = await self._create_sub_pool_from_parent(
            self._config, process_index, start_python
        )

        self._config.reset_pool_external_address(process_index, external_addresses[0])
        self.attach_sub_process(external_addresses[0], process)

        control_message = ControlMessage(
            message_id=new_message_id(),
            address=self.external_address,
            control_message_type=ControlMessageType.sync_config,
            content=self._config,
        )
        await self.handle_control_command(control_message)
        # Return the address reported by the sub pool itself (presumably
        # carrying the port that was actually bound).
        return external_addresses[0]

    async def remove_sub_pool(
        self, external_address: str, timeout: float | None = None, force: bool = False
    ):
        """Remove the sub pool at ``external_address`` and stop its process.

        The pool is dropped from ``sub_processes`` and from the configuration
        before it is stopped; afterwards the configuration is synchronised
        through a ``sync_config`` control message.
        """
        process = self.sub_processes[external_address]
        process_index = self._config.get_process_index(external_address)
        del self.sub_processes[external_address]
        self._config.remove_pool_config(process_index)
        await self.stop_sub_pool(external_address, process, timeout, force)

        control_message = ControlMessage(
            message_id=new_message_id(),
            address=self.external_address,
            control_message_type=ControlMessageType.sync_config,
            content=self._config,
        )
        await self.handle_control_command(control_message)

    async def kill_sub_pool(
        self, process: asyncio.subprocess.Process, force: bool = False
    ):
        """Terminate a sub pool process, escalating to a kill when needed.

        Args:
            process: The sub pool subprocess.
            force: When true, skip the graceful ``terminate()`` attempt and
                kill the process right away.
        """
        # First, try to terminate the process gracefully
        if not force:
            try:
                process.terminate()
                # Wait for graceful termination
                try:
                    await asyncio.wait_for(process.wait(), timeout=2.0)
                except asyncio.TimeoutError:
                    # Process didn't terminate gracefully, force kill
                    force = True
            except ProcessLookupError:
                # Process already terminated
                pass

        # Force kill if needed or if graceful termination failed
        if force:
            try:
                process.kill()
            except ProcessLookupError:
                # Process already dead
                pass

        # Ensure process is completely terminated and cleaned up
        try:
            # Wait for process to complete
            if process.returncode is None:
                try:
                    await asyncio.wait_for(process.wait(), timeout=5.0)
                except asyncio.TimeoutError:
                    pass
        except ProcessLookupError:
            # Process already terminated
            pass

        # Python 3.13 specific cleanup for waitpid threads
        if sys.version_info >= (3, 13):
            try:
                # Close the transport to clean up waitpid thread
                if hasattr(process, "_transport") and process._transport:
                    process._transport.close()
                # Also try to close the pipe transport if it exists
                if hasattr(process, "_pipes") and process._pipes:
                    for pipe in process._pipes.values():
                        if hasattr(pipe, "close"):
                            pipe.close()
            except Exception:
                # Ignore errors during cleanup
                pass

        # Additional cleanup using psutil to ensure process tree is terminated.
        # Only done while the process has not been reaped yet: once
        # ``returncode`` is set the pid may have been reused by an unrelated
        # process, which must not be killed.
        if process.returncode is None:
            try:
                p = psutil.Process(process.pid)
                if p.is_running():
                    # Kill the entire process tree
                    for child in p.children(recursive=True):
                        try:
                            child.kill()
                        except psutil.NoSuchProcess:
                            pass
                    p.kill()
                    p.wait(timeout=2.0)
            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
                # Process already dead or couldn't be killed
                pass

    async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
        """Return ``True`` while ``process`` has no return code yet."""
        return process.returncode is None

    async def recover_sub_pool(self, address: str):
        """Restart the sub pool registered under ``address``.

        When ``self._auto_recover`` equals ``"actor"``, every actor creation
        message recorded for that address is sent to the new sub pool again.
        """
        process_index = self._config.get_process_index(address)
        # process dead, restart it
        # remember always use spawn to recover sub pool
        task = asyncio.create_task(self.start_sub_pool(self._config, process_index))
        self.sub_processes[address] = (await self.wait_sub_pools_ready([task]))[0][0]

        if self._auto_recover == "actor":
            # need to recover all created actors
            for _, message in self._allocated_actors[address].values():
                create_actor_message: CreateActorMessage = message  # type: ignore
                await self.call(address, create_actor_message)

    async def start(self):
        """Start the pool via the base class, then start the monitor."""
        await super().start()
        await self.start_monitor()
511
+
512
+
513
@_register_message_handler
class SubActorPool(SubActorPoolBase):
    """Sub actor pool for this backend; adds nothing to ``SubActorPoolBase``."""