xoscar-0.8.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xoscar might be problematic.

Files changed (93)
  1. xoscar/__init__.py +61 -0
  2. xoscar/_utils.cpython-313-x86_64-linux-gnu.so +0 -0
  3. xoscar/_utils.pxd +36 -0
  4. xoscar/_utils.pyx +246 -0
  5. xoscar/_version.py +693 -0
  6. xoscar/aio/__init__.py +16 -0
  7. xoscar/aio/base.py +86 -0
  8. xoscar/aio/file.py +59 -0
  9. xoscar/aio/lru.py +228 -0
  10. xoscar/aio/parallelism.py +39 -0
  11. xoscar/api.py +527 -0
  12. xoscar/backend.py +67 -0
  13. xoscar/backends/__init__.py +14 -0
  14. xoscar/backends/allocate_strategy.py +160 -0
  15. xoscar/backends/communication/__init__.py +30 -0
  16. xoscar/backends/communication/base.py +315 -0
  17. xoscar/backends/communication/core.py +69 -0
  18. xoscar/backends/communication/dummy.py +253 -0
  19. xoscar/backends/communication/errors.py +20 -0
  20. xoscar/backends/communication/socket.py +444 -0
  21. xoscar/backends/communication/ucx.py +538 -0
  22. xoscar/backends/communication/utils.py +97 -0
  23. xoscar/backends/config.py +157 -0
  24. xoscar/backends/context.py +437 -0
  25. xoscar/backends/core.py +352 -0
  26. xoscar/backends/indigen/__init__.py +16 -0
  27. xoscar/backends/indigen/__main__.py +19 -0
  28. xoscar/backends/indigen/backend.py +51 -0
  29. xoscar/backends/indigen/driver.py +26 -0
  30. xoscar/backends/indigen/fate_sharing.py +221 -0
  31. xoscar/backends/indigen/pool.py +509 -0
  32. xoscar/backends/indigen/shared_memory.py +548 -0
  33. xoscar/backends/message.cpython-313-x86_64-linux-gnu.so +0 -0
  34. xoscar/backends/message.pyi +255 -0
  35. xoscar/backends/message.pyx +646 -0
  36. xoscar/backends/pool.py +1630 -0
  37. xoscar/backends/router.py +285 -0
  38. xoscar/backends/test/__init__.py +16 -0
  39. xoscar/backends/test/backend.py +38 -0
  40. xoscar/backends/test/pool.py +233 -0
  41. xoscar/batch.py +256 -0
  42. xoscar/collective/__init__.py +27 -0
  43. xoscar/collective/backend/__init__.py +13 -0
  44. xoscar/collective/backend/nccl_backend.py +160 -0
  45. xoscar/collective/common.py +102 -0
  46. xoscar/collective/core.py +737 -0
  47. xoscar/collective/process_group.py +687 -0
  48. xoscar/collective/utils.py +41 -0
  49. xoscar/collective/xoscar_pygloo.cpython-313-x86_64-linux-gnu.so +0 -0
  50. xoscar/collective/xoscar_pygloo.pyi +239 -0
  51. xoscar/constants.py +23 -0
  52. xoscar/context.cpython-313-x86_64-linux-gnu.so +0 -0
  53. xoscar/context.pxd +21 -0
  54. xoscar/context.pyx +368 -0
  55. xoscar/core.cpython-313-x86_64-linux-gnu.so +0 -0
  56. xoscar/core.pxd +51 -0
  57. xoscar/core.pyx +664 -0
  58. xoscar/debug.py +188 -0
  59. xoscar/driver.py +42 -0
  60. xoscar/errors.py +63 -0
  61. xoscar/libcpp.pxd +31 -0
  62. xoscar/metrics/__init__.py +21 -0
  63. xoscar/metrics/api.py +288 -0
  64. xoscar/metrics/backends/__init__.py +13 -0
  65. xoscar/metrics/backends/console/__init__.py +13 -0
  66. xoscar/metrics/backends/console/console_metric.py +82 -0
  67. xoscar/metrics/backends/metric.py +149 -0
  68. xoscar/metrics/backends/prometheus/__init__.py +13 -0
  69. xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
  70. xoscar/nvutils.py +717 -0
  71. xoscar/profiling.py +260 -0
  72. xoscar/serialization/__init__.py +20 -0
  73. xoscar/serialization/aio.py +142 -0
  74. xoscar/serialization/core.cpython-313-x86_64-linux-gnu.so +0 -0
  75. xoscar/serialization/core.pxd +28 -0
  76. xoscar/serialization/core.pyi +57 -0
  77. xoscar/serialization/core.pyx +944 -0
  78. xoscar/serialization/cuda.py +111 -0
  79. xoscar/serialization/exception.py +48 -0
  80. xoscar/serialization/mlx.py +67 -0
  81. xoscar/serialization/numpy.py +82 -0
  82. xoscar/serialization/pyfury.py +37 -0
  83. xoscar/serialization/scipy.py +72 -0
  84. xoscar/utils.py +522 -0
  85. xoscar/virtualenv/__init__.py +34 -0
  86. xoscar/virtualenv/core.py +268 -0
  87. xoscar/virtualenv/platform.py +56 -0
  88. xoscar/virtualenv/utils.py +100 -0
  89. xoscar/virtualenv/uv.py +321 -0
  90. xoscar-0.8.0.dist-info/METADATA +229 -0
  91. xoscar-0.8.0.dist-info/RECORD +93 -0
  92. xoscar-0.8.0.dist-info/WHEEL +6 -0
  93. xoscar-0.8.0.dist-info/top_level.txt +2 -0
xoscar/backends/indigen/pool.py
@@ -0,0 +1,509 @@
+ # Copyright 2022-2023 XProbe Inc.
+ # derived from copyright 1999-2021 Alibaba Group Holding Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import asyncio
+ import asyncio.subprocess
+ import configparser
+ import itertools
+ import logging.config
+ import os
+ import pickle
+ import random
+ import signal
+ import struct
+ import sys
+ import threading
+ import time
+ import uuid
+ from enum import IntEnum
+ from typing import List, Optional
+
+ import psutil
+
+ from ..._utils import reset_id_random_seed
+ from ...utils import ensure_coverage
+ from ..config import ActorPoolConfig
+ from ..message import (
+     ControlMessage,
+     ControlMessageType,
+     CreateActorMessage,
+     new_message_id,
+ )
+ from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
+ from . import shared_memory
+ from .fate_sharing import create_subprocess_exec
+
+ _SUBPROCESS_SHM_SIZE = 10240
+ _is_windows: bool = sys.platform.startswith("win")
+
+ logger = logging.getLogger(__name__)
+
+
+ class _ShmSeq(IntEnum):
+     INIT_PARAMS = 1
+     INIT_RESULT = 2
+
+
+ def _shm_put_object(seq: _ShmSeq, shm: shared_memory.SharedMemory, o: object):
+     serialized = pickle.dumps(o)
+     assert (
+         len(serialized) < _SUBPROCESS_SHM_SIZE - 8
+     ), f"Serialized object {o} is too long."
+     shm.buf[4:8] = struct.pack("<I", len(serialized))
+     shm.buf[8 : 8 + len(serialized)] = serialized
+     shm.buf[:4] = struct.pack("<I", seq)
+
+
+ def _shm_get_object(seq: _ShmSeq, shm: shared_memory.SharedMemory):
+     recv_seq = struct.unpack("<I", shm.buf[:4])[0]
+     if recv_seq != seq:
+         return
+     size = struct.unpack("<I", shm.buf[4:8])[0]
+     return pickle.loads(shm.buf[8 : 8 + size])
+
+
+ @_register_message_handler
+ class MainActorPool(MainActorPoolBase):
+     @classmethod
+     def get_external_addresses(
+         cls,
+         address: str,
+         n_process: int | None = None,
+         ports: list[int] | None = None,
+         schemes: list[Optional[str]] | None = None,
+     ):
+         """Get external address for every process"""
+         assert n_process is not None
+         if ":" in address:
+             host, port_str = address.rsplit(":", 1)
+             port = int(port_str)
+             if ports:
+                 if len(ports) != n_process:
+                     raise ValueError(
+                         f"`ports` specified, but its count "
+                         f"is not equal to `n_process`, "
+                         f"number of ports: {len(ports)}, "
+                         f"n_process: {n_process}"
+                     )
+                 sub_ports = ports
+             else:
+                 sub_ports = [0] * n_process
+         else:
+             host = address
+             if ports and len(ports) != n_process + 1:
+                 # ports specified, the first of which should be main port
+                 raise ValueError(
+                     f"`ports` specified, but its count "
+                     f"is not equal to `n_process` + 1, "
+                     f"number of ports: {len(ports)}, "
+                     f"n_process + 1: {n_process + 1}"
+                 )
+             elif not ports:
+                 ports = [0] * (n_process + 1)
+             port = ports[0]
+             sub_ports = ports[1:]
+         if not schemes:
+             prefix_iter = itertools.repeat("")
+         else:
+             prefix_iter = [f"{scheme}://" if scheme else "" for scheme in schemes]  # type: ignore
+         return [
+             f"{prefix}{host}:{port}"
+             for port, prefix in zip([port] + sub_ports, prefix_iter)
+         ]
+
+     @classmethod
+     def gen_internal_address(
+         cls, process_index: int, external_address: str | None = None
+     ) -> str | None:
+         if hasattr(asyncio, "start_unix_server"):
+             return f"unixsocket:///{process_index}"
+         else:
+             return external_address
+
+     @classmethod
+     async def start_sub_pool(
+         cls,
+         actor_pool_config: ActorPoolConfig,
+         process_index: int,
+         start_python: str | None = None,
+     ):
+         return await cls._create_sub_pool_from_parent(
+             actor_pool_config, process_index, start_python
+         )
+
+     @classmethod
+     async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+         processes: list[asyncio.subprocess.Process] = []
+         ext_addresses = []
+         error = None
+         for task in create_pool_tasks:
+             process, address = await task
+             processes.append(process)
+             ext_addresses.append(address)
+         if error:
+             for p in processes:
+                 # error happens, kill all subprocesses
+                 p.kill()
+             raise error
+         return processes, ext_addresses
+
+     @classmethod
+     def _start_sub_pool_in_child(
+         cls,
+         shm_name: str,
+     ):
+         ensure_coverage()
+
+         shm = shared_memory.SharedMemory(shm_name, track=False)
+         try:
+             config = _shm_get_object(_ShmSeq.INIT_PARAMS, shm)
+             # Check Python version once.
+             sub_pool_python_version = config.pop("python_version", None)
+             if (
+                 sub_pool_python_version is not None
+                 and sub_pool_python_version != sys.hexversion
+             ):
+                 logger.warning(
+                     f"The sub pool is using a different Python version, you may encounter serialization issues."
+                     f" sub pool: {sub_pool_python_version}, main pool: {sys.hexversion}"
+                 )
+             actor_config = config["actor_pool_config"]
+             process_index = config["process_index"]
+             main_pool_pid = config["main_pool_pid"]
+
+             def _check_ppid():
+                 while True:
+                     try:
+                         # We can't simply check if the os.getppid() equals with main_pool_pid,
+                         # as the double fork may result in a new process as the parent.
+                         psutil.Process(main_pool_pid)
+                     except psutil.NoSuchProcess:
+                         logger.error("Exit due to main pool %s exit.", main_pool_pid)
+                         os._exit(233)  # Special exit code for debugging.
+                     except Exception as e:
+                         logger.exception("Check ppid failed: %s", e)
+                     time.sleep(10)
+
+             t = threading.Thread(target=_check_ppid, daemon=True)
+             t.start()
+
+             # make sure enough randomness for every sub pool
+             random.seed(uuid.uuid1().bytes)
+             reset_id_random_seed()
+
+             conf = actor_config.get_pool_config(process_index)
+             suspend_sigint = conf["suspend_sigint"]
+             if suspend_sigint:
+                 signal.signal(signal.SIGINT, lambda *_: None)
+
+             logging_conf = conf["logging_conf"] or {}
+             if isinstance(logging_conf, configparser.RawConfigParser):
+                 logging.config.fileConfig(logging_conf)
+             elif logging_conf.get("dict"):
+                 logging.config.dictConfig(logging_conf["dict"])
+             elif logging_conf.get("file"):
+                 logging.config.fileConfig(logging_conf["file"])
+             elif logging_conf.get("level"):
+                 logging.getLogger("__main__").setLevel(logging_conf["level"])
+                 logging.getLogger("xoscar").setLevel(logging_conf["level"])
+                 if logging_conf.get("format"):
+                     logging.basicConfig(format=logging_conf["format"])
+
+             use_uvloop = conf["use_uvloop"]
+             if use_uvloop:
+                 import uvloop
+
+                 asyncio.set_event_loop(uvloop.new_event_loop())
+             else:
+                 asyncio.set_event_loop(asyncio.new_event_loop())
+
+             coro = cls._create_sub_pool(actor_config, process_index, main_pool_pid, shm)
+             asyncio.run(coro)
+         finally:
+             shm.close()
+
+     @classmethod
+     async def _create_sub_pool(
+         cls,
+         actor_config: ActorPoolConfig,
+         process_index: int,
+         main_pool_pid: int,
+         shm: shared_memory.SharedMemory,
+     ):
+         cur_pool_config = actor_config.get_pool_config(process_index)
+         env = cur_pool_config["env"]
+         if env:
+             os.environ.update(env)
+         pool = await SubActorPool.create(
+             {
+                 "actor_pool_config": actor_config,
+                 "process_index": process_index,
+                 "main_pool_pid": main_pool_pid,
+             }
+         )
+         await pool.start()
+         _shm_put_object(_ShmSeq.INIT_RESULT, shm, cur_pool_config["external_address"])
+         await pool.join()
+
+     @staticmethod
+     async def _create_sub_pool_from_parent(
+         actor_pool_config: ActorPoolConfig,
+         process_index: int,
+         start_python: str | None = None,
+     ):
+         # We check the Python version in _shm_get_object to make it faster,
+         # as in most cases the Python versions are the same.
+         if start_python is None:
+             start_python = sys.executable
+
+         external_addresses: List | None = None
+         shm = shared_memory.SharedMemory(
+             create=True, size=_SUBPROCESS_SHM_SIZE, track=False
+         )
+         try:
+             _shm_put_object(
+                 _ShmSeq.INIT_PARAMS,
+                 shm,
+                 {
+                     "actor_pool_config": actor_pool_config,
+                     "process_index": process_index,
+                     "main_pool_pid": os.getpid(),
+                     "python_version": sys.hexversion,
+                 },
+             )
+             cmd = [
+                 start_python,
+                 "-m",
+                 "xoscar.backends.indigen",
+                 "start_sub_pool",
+                 "-sn",
+                 shm.name,
+             ]
+             # We need to inherit the parent environment to ensure the subprocess works correctly on Windows.
+             new_env = dict(os.environ)
+             env = actor_pool_config.get_pool_config(process_index).get("env") or {}
+             new_env.update(env)
+             logger.info("Creating sub pool via command: %s", cmd)
+             process = await create_subprocess_exec(*cmd, env=new_env)
+
+             def _get_external_addresses():
+                 try:
+                     nonlocal external_addresses
+                     while (
+                         shm
+                         and shm.buf is not None
+                         and not (
+                             external_addresses := _shm_get_object(
+                                 _ShmSeq.INIT_RESULT, shm
+                             )
+                         )
+                     ):
+                         time.sleep(0.1)
+                 except asyncio.CancelledError:
+                     pass
+
+             _, unfinished = await asyncio.wait(
+                 [
+                     asyncio.create_task(process.wait()),
+                     asyncio.create_task(asyncio.to_thread(_get_external_addresses)),
+                 ],
+                 return_when=asyncio.FIRST_COMPLETED,
+             )
+             for t in unfinished:
+                 t.cancel()
+         finally:
+             shm.close()
+             shm.unlink()
+         if external_addresses is None:
+             raise OSError(f"Start sub pool failed, returncode: {process.returncode}")
+         return process, external_addresses
+
+     async def append_sub_pool(
+         self,
+         label: str | None = None,
+         internal_address: str | None = None,
+         external_address: str | None = None,
+         env: dict | None = None,
+         modules: list[str] | None = None,
+         suspend_sigint: bool | None = None,
+         use_uvloop: bool | None = None,
+         logging_conf: dict | None = None,
+         start_python: str | None = None,
+         kwargs: dict | None = None,
+     ):
+         # external_address has port 0, subprocess will bind random port.
+         external_address = (
+             external_address
+             or MainActorPool.get_external_addresses(self.external_address, n_process=1)[
+                 -1
+             ]
+         )
+
+         # use last process index's logging_conf and use_uv_loop config if not provide
+         actor_pool_config = self._config.as_dict()
+         last_process_index = self._config.get_process_indexes()[-1]
+         last_logging_conf = actor_pool_config["pools"][last_process_index][
+             "logging_conf"
+         ]
+         last_use_uv_loop = actor_pool_config["pools"][last_process_index]["use_uvloop"]
+         _logging_conf = logging_conf or last_logging_conf
+         _use_uv_loop = use_uvloop if use_uvloop is not None else last_use_uv_loop
+
+         process_index = next(MainActorPool.process_index_gen(external_address))
+         internal_address = internal_address or MainActorPool.gen_internal_address(
+             process_index, external_address
+         )
+
+         self._config.add_pool_conf(
+             process_index,
+             label,
+             internal_address,
+             external_address,
+             env,
+             modules,
+             suspend_sigint,
+             _use_uv_loop,
+             _logging_conf,
+             kwargs,
+         )
+
+         process, external_addresses = await self._create_sub_pool_from_parent(
+             self._config, process_index, start_python
+         )
+
+         self._config.reset_pool_external_address(process_index, external_addresses[0])
+         self.attach_sub_process(external_addresses[0], process)
+
+         control_message = ControlMessage(
+             message_id=new_message_id(),
+             address=self.external_address,
+             control_message_type=ControlMessageType.sync_config,
+             content=self._config,
+         )
+         await self.handle_control_command(control_message)
+         # The actual port will return in process_status.
+         return external_addresses[0]
+
+     async def remove_sub_pool(
+         self, external_address: str, timeout: float | None = None, force: bool = False
+     ):
+         process = self.sub_processes[external_address]
+         process_index = self._config.get_process_index(external_address)
+         del self.sub_processes[external_address]
+         self._config.remove_pool_config(process_index)
+         await self.stop_sub_pool(external_address, process, timeout, force)
+
+         control_message = ControlMessage(
+             message_id=new_message_id(),
+             address=self.external_address,
+             control_message_type=ControlMessageType.sync_config,
+             content=self._config,
+         )
+         await self.handle_control_command(control_message)
+
+     async def kill_sub_pool(
+         self, process: asyncio.subprocess.Process, force: bool = False
+     ):
+         # First, try to terminate the process gracefully
+         if not force:
+             try:
+                 process.terminate()
+                 # Wait for graceful termination
+                 try:
+                     await asyncio.wait_for(process.wait(), timeout=2.0)
+                 except asyncio.TimeoutError:
+                     # Process didn't terminate gracefully, force kill
+                     force = True
+             except ProcessLookupError:
+                 # Process already terminated
+                 pass
+
+         # Force kill if needed or if graceful termination failed
+         if force:
+             try:
+                 process.kill()
+             except ProcessLookupError:
+                 # Process already dead
+                 pass
+
+         # Ensure process is completely terminated and cleaned up
+         try:
+             # Wait for process to complete
+             if process.returncode is None:
+                 try:
+                     await asyncio.wait_for(process.wait(), timeout=5.0)
+                 except asyncio.TimeoutError:
+                     pass
+         except ProcessLookupError:
+             # Process already terminated
+             pass
+
+         # Python 3.13 specific cleanup for waitpid threads
+         if sys.version_info >= (3, 13):
+             try:
+                 # Close the transport to clean up waitpid thread
+                 if hasattr(process, "_transport") and process._transport:
+                     process._transport.close()
+                 # Also try to close the pipe transport if it exists
+                 if hasattr(process, "_pipes") and process._pipes:
+                     for pipe in process._pipes.values():
+                         if hasattr(pipe, "close"):
+                             pipe.close()
+             except Exception:
+                 # Ignore errors during cleanup
+                 pass
+
+         # Additional cleanup using psutil to ensure process tree is terminated
+         try:
+             p = psutil.Process(process.pid)
+             if p.is_running():
+                 # Kill the entire process tree
+                 for child in p.children(recursive=True):
+                     try:
+                         child.kill()
+                     except psutil.NoSuchProcess:
+                         pass
+                 p.kill()
+                 p.wait(timeout=2.0)
+         except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+             # Process already dead or couldn't be killed
+             pass
+
+     async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
+         return process.returncode is None
+
+     async def recover_sub_pool(self, address: str):
+         process_index = self._config.get_process_index(address)
+         # process dead, restart it
+         # remember always use spawn to recover sub pool
+         task = asyncio.create_task(self.start_sub_pool(self._config, process_index))
+         self.sub_processes[address] = (await self.wait_sub_pools_ready([task]))[0][0]
+
+         if self._auto_recover == "actor":
+             # need to recover all created actors
+             for _, message in self._allocated_actors[address].values():
+                 create_actor_message: CreateActorMessage = message  # type: ignore
+                 await self.call(address, create_actor_message)
+
+     async def start(self):
+         await super().start()
+         await self.start_monitor()
+
+
+ @_register_message_handler
+ class SubActorPool(SubActorPoolBase):
+     pass
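
Note for reviewers: the main pool and its subprocess hand the startup parameters and the bound external address to each other through a single small shared-memory block rather than a pipe. Bytes 0-4 hold a little-endian sequence tag, bytes 4-8 the pickled payload length, and the rest the payload; the tag is written last so a polling reader never observes a half-written record. The following is a minimal, self-contained sketch of that layout. It substitutes the stdlib multiprocessing.shared_memory for the vendored xoscar.backends.indigen.shared_memory module (which also accepts a track= argument), and the put/get helper names are illustrative, not part of the package.

# Illustrative sketch only; mirrors _shm_put_object/_shm_get_object above.
import pickle
import struct
from multiprocessing import shared_memory

SHM_SIZE = 10240  # same size as _SUBPROCESS_SHM_SIZE in the diff

def put(shm, seq: int, obj: object) -> None:
    payload = pickle.dumps(obj)
    assert len(payload) < SHM_SIZE - 8, "object too large for the shared block"
    shm.buf[4:8] = struct.pack("<I", len(payload))          # payload length
    shm.buf[8:8 + len(payload)] = payload                    # pickled payload
    shm.buf[:4] = struct.pack("<I", seq)                     # sequence tag written last

def get(shm, seq: int):
    if struct.unpack("<I", shm.buf[:4])[0] != seq:
        return None  # record for this sequence number not published yet
    size = struct.unpack("<I", shm.buf[4:8])[0]
    return pickle.loads(shm.buf[8:8 + size])

if __name__ == "__main__":
    shm = shared_memory.SharedMemory(create=True, size=SHM_SIZE)
    try:
        put(shm, 1, {"process_index": 0, "main_pool_pid": 1234})  # parent -> child (INIT_PARAMS)
        print(get(shm, 1))
        put(shm, 2, ["127.0.0.1:12345"])                          # child -> parent (INIT_RESULT)
        print(get(shm, 2))
    finally:
        shm.close()
        shm.unlink()

In the package itself the child polls for the INIT_PARAMS record at startup and the parent polls for INIT_RESULT (in a thread, racing against process.wait()), which is why the one-writer-then-one-reader ordering per sequence tag is sufficient without any locking.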