xoscar 0.7.15__cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xoscar might be problematic. Click here for more details.

Files changed (88):
  1. xoscar/__init__.py +61 -0
  2. xoscar/_utils.cpython-39-aarch64-linux-gnu.so +0 -0
  3. xoscar/_utils.pxd +36 -0
  4. xoscar/_utils.pyx +246 -0
  5. xoscar/_version.py +693 -0
  6. xoscar/aio/__init__.py +16 -0
  7. xoscar/aio/base.py +86 -0
  8. xoscar/aio/file.py +59 -0
  9. xoscar/aio/lru.py +228 -0
  10. xoscar/aio/parallelism.py +39 -0
  11. xoscar/api.py +527 -0
  12. xoscar/backend.py +67 -0
  13. xoscar/backends/__init__.py +14 -0
  14. xoscar/backends/allocate_strategy.py +160 -0
  15. xoscar/backends/communication/__init__.py +30 -0
  16. xoscar/backends/communication/base.py +315 -0
  17. xoscar/backends/communication/core.py +69 -0
  18. xoscar/backends/communication/dummy.py +253 -0
  19. xoscar/backends/communication/errors.py +20 -0
  20. xoscar/backends/communication/socket.py +444 -0
  21. xoscar/backends/communication/ucx.py +538 -0
  22. xoscar/backends/communication/utils.py +97 -0
  23. xoscar/backends/config.py +157 -0
  24. xoscar/backends/context.py +437 -0
  25. xoscar/backends/core.py +304 -0
  26. xoscar/backends/indigen/__init__.py +16 -0
  27. xoscar/backends/indigen/__main__.py +19 -0
  28. xoscar/backends/indigen/backend.py +51 -0
  29. xoscar/backends/indigen/driver.py +26 -0
  30. xoscar/backends/indigen/fate_sharing.py +221 -0
  31. xoscar/backends/indigen/pool.py +465 -0
  32. xoscar/backends/indigen/shared_memory.py +548 -0
  33. xoscar/backends/message.cpython-39-aarch64-linux-gnu.so +0 -0
  34. xoscar/backends/message.pyx +646 -0
  35. xoscar/backends/pool.py +1625 -0
  36. xoscar/backends/router.py +285 -0
  37. xoscar/backends/test/__init__.py +16 -0
  38. xoscar/backends/test/backend.py +38 -0
  39. xoscar/backends/test/pool.py +197 -0
  40. xoscar/batch.py +256 -0
  41. xoscar/collective/__init__.py +27 -0
  42. xoscar/collective/common.py +102 -0
  43. xoscar/collective/core.py +737 -0
  44. xoscar/collective/process_group.py +687 -0
  45. xoscar/collective/utils.py +41 -0
  46. xoscar/collective/xoscar_pygloo.cpython-39-aarch64-linux-gnu.so +0 -0
  47. xoscar/constants.py +23 -0
  48. xoscar/context.cpython-39-aarch64-linux-gnu.so +0 -0
  49. xoscar/context.pxd +21 -0
  50. xoscar/context.pyx +368 -0
  51. xoscar/core.cpython-39-aarch64-linux-gnu.so +0 -0
  52. xoscar/core.pxd +51 -0
  53. xoscar/core.pyx +664 -0
  54. xoscar/debug.py +188 -0
  55. xoscar/driver.py +42 -0
  56. xoscar/errors.py +63 -0
  57. xoscar/libcpp.pxd +31 -0
  58. xoscar/metrics/__init__.py +21 -0
  59. xoscar/metrics/api.py +288 -0
  60. xoscar/metrics/backends/__init__.py +13 -0
  61. xoscar/metrics/backends/console/__init__.py +13 -0
  62. xoscar/metrics/backends/console/console_metric.py +82 -0
  63. xoscar/metrics/backends/metric.py +149 -0
  64. xoscar/metrics/backends/prometheus/__init__.py +13 -0
  65. xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
  66. xoscar/nvutils.py +717 -0
  67. xoscar/profiling.py +260 -0
  68. xoscar/serialization/__init__.py +20 -0
  69. xoscar/serialization/aio.py +142 -0
  70. xoscar/serialization/core.cpython-39-aarch64-linux-gnu.so +0 -0
  71. xoscar/serialization/core.pxd +28 -0
  72. xoscar/serialization/core.pyx +944 -0
  73. xoscar/serialization/cuda.py +111 -0
  74. xoscar/serialization/exception.py +48 -0
  75. xoscar/serialization/mlx.py +67 -0
  76. xoscar/serialization/numpy.py +82 -0
  77. xoscar/serialization/pyfury.py +37 -0
  78. xoscar/serialization/scipy.py +72 -0
  79. xoscar/utils.py +517 -0
  80. xoscar/virtualenv/__init__.py +34 -0
  81. xoscar/virtualenv/core.py +268 -0
  82. xoscar/virtualenv/platform.py +53 -0
  83. xoscar/virtualenv/utils.py +100 -0
  84. xoscar/virtualenv/uv.py +318 -0
  85. xoscar-0.7.15.dist-info/METADATA +226 -0
  86. xoscar-0.7.15.dist-info/RECORD +88 -0
  87. xoscar-0.7.15.dist-info/WHEEL +6 -0
  88. xoscar-0.7.15.dist-info/top_level.txt +2 -0
@@ -0,0 +1,465 @@
1
+ # Copyright 2022-2023 XProbe Inc.
2
+ # derived from copyright 1999-2021 Alibaba Group Holding Ltd.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import asyncio.subprocess
20
+ import configparser
21
+ import itertools
22
+ import logging.config
23
+ import os
24
+ import pickle
25
+ import random
26
+ import signal
27
+ import struct
28
+ import sys
29
+ import threading
30
+ import time
31
+ import uuid
32
+ from enum import IntEnum
33
+ from typing import List, Optional
34
+
35
+ import psutil
36
+
37
+ from ..._utils import reset_id_random_seed
38
+ from ...utils import ensure_coverage
39
+ from ..config import ActorPoolConfig
40
+ from ..message import (
41
+ ControlMessage,
42
+ ControlMessageType,
43
+ CreateActorMessage,
44
+ new_message_id,
45
+ )
46
+ from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
47
+ from . import shared_memory
48
+ from .fate_sharing import create_subprocess_exec
49
+
50
# Size in bytes of the shared-memory segment used for the start-up handshake
# between the main pool and a sub pool process. Eight bytes of it are used as
# a header (sequence tag + payload length), the rest holds a pickled payload.
_SUBPROCESS_SHM_SIZE = 10240
# True when the interpreter reports a platform name starting with "win".
_is_windows: bool = sys.platform.startswith("win")

logger = logging.getLogger(__name__)
54
+
55
+
56
class _ShmSeq(IntEnum):
    """Sequence tags stored in the first four bytes of the handshake memory."""

    # Payload holds the start-up parameters written by the main pool.
    INIT_PARAMS = 1
    # Payload holds the start-up result written by the sub pool.
    INIT_RESULT = 2
59
+
60
+
61
def _shm_put_object(seq: _ShmSeq, shm: shared_memory.SharedMemory, o: object):
    """Pickle ``o`` into ``shm`` and tag the buffer with ``seq``.

    Buffer layout (all integers are little-endian unsigned 32-bit):

    * bytes 0-3: sequence tag
    * bytes 4-7: length of the pickled payload
    * bytes 8..: the pickled payload

    Args:
        seq: tag identifying what kind of payload is stored.
        shm: shared-memory object exposing a writable ``buf``.
        o: object to pickle into the buffer.

    Raises:
        ValueError: if the pickled payload does not fit into
            ``_SUBPROCESS_SHM_SIZE - 8`` bytes.
    """
    serialized = pickle.dumps(o)
    size = len(serialized)
    # An explicit check instead of ``assert`` so that it still runs under
    # ``python -O``.
    if size >= _SUBPROCESS_SHM_SIZE - 8:
        raise ValueError(f"Serialized object {o} is too long.")
    struct.pack_into("<I", shm.buf, 4, size)
    shm.buf[8 : 8 + size] = serialized
    # The tag is written last: a reader polling the tag only sees it after
    # the length and the payload have been stored.
    struct.pack_into("<I", shm.buf, 0, seq)
69
+
70
+
71
def _shm_get_object(seq: _ShmSeq, shm: shared_memory.SharedMemory):
    """Return the object stored in ``shm`` if it is tagged with ``seq``.

    The first four bytes of ``shm.buf`` are read as a little-endian unsigned
    32-bit tag. When the tag equals ``seq`` the next four bytes give the
    payload length and the payload starting at byte 8 is unpickled and
    returned; otherwise ``None`` is returned.
    """
    (tag,) = struct.unpack_from("<I", shm.buf, 0)
    if tag == seq:
        (length,) = struct.unpack_from("<I", shm.buf, 4)
        return pickle.loads(shm.buf[8 : 8 + length])
    return None
77
+
78
+
79
@_register_message_handler
class MainActorPool(MainActorPoolBase):
    """Main actor pool whose sub pools run in separately launched processes.

    A sub pool is started with
    ``<python> -m xoscar.backends.indigen start_sub_pool -sn <shm name>``.
    The main pool passes the start-up parameters to the child through a small
    shared-memory segment and reads the child's external address back from
    the same segment.
    """

    @classmethod
    def get_external_addresses(
        cls,
        address: str,
        n_process: int | None = None,
        ports: list[int] | None = None,
        schemes: list[Optional[str]] | None = None,
    ):
        """Get external address for every process.

        Args:
            address: ``host`` or ``host:port`` of the main pool.
            n_process: number of sub processes; must not be ``None``.
            ports: when ``address`` carries a port, one port per sub process;
                otherwise ``n_process + 1`` ports, the first of which is the
                main pool port. Missing ports default to ``0``.
            schemes: optional scheme per address; a falsy entry yields no
                ``scheme://`` prefix. When given, the result is truncated to
                the length of ``schemes`` (it is zipped with the ports).

        Returns:
            A list of ``[scheme://]host:port`` strings, main pool first.

        Raises:
            ValueError: if the number of ``ports`` does not match.
        """
        assert n_process is not None
        if ":" in address:
            host, port_str = address.rsplit(":", 1)
            port = int(port_str)
            if ports:
                if len(ports) != n_process:
                    raise ValueError(
                        f"`ports` specified, but its count "
                        f"is not equal to `n_process`, "
                        f"number of ports: {len(ports)}, "
                        f"n_process: {n_process}"
                    )
                sub_ports = ports
            else:
                sub_ports = [0] * n_process
        else:
            host = address
            if ports and len(ports) != n_process + 1:
                # ports specified, the first of which should be main port
                raise ValueError(
                    f"`ports` specified, but its count "
                    f"is not equal to `n_process` + 1, "
                    f"number of ports: {len(ports)}, "
                    f"n_process + 1: {n_process + 1}"
                )
            elif not ports:
                ports = [0] * (n_process + 1)
            port = ports[0]
            sub_ports = ports[1:]
        if not schemes:
            prefix_iter = itertools.repeat("")
        else:
            prefix_iter = [f"{scheme}://" if scheme else "" for scheme in schemes]  # type: ignore
        return [
            f"{prefix}{host}:{port}"
            for port, prefix in zip([port] + sub_ports, prefix_iter)
        ]

    @classmethod
    def gen_internal_address(
        cls, process_index: int, external_address: str | None = None
    ) -> str | None:
        """Return the internal address for the pool at ``process_index``.

        A ``unixsocket:///<process_index>`` address is used when ``asyncio``
        provides ``start_unix_server``; otherwise ``external_address`` is
        returned unchanged.
        """
        if hasattr(asyncio, "start_unix_server"):
            return f"unixsocket:///{process_index}"
        else:
            return external_address

    @classmethod
    async def start_sub_pool(
        cls,
        actor_pool_config: ActorPoolConfig,
        process_index: int,
        start_python: str | None = None,
    ):
        """Launch the sub pool at ``process_index``.

        Returns the ``(process, external_addresses)`` pair produced by
        ``_create_sub_pool_from_parent``.
        """
        return await cls._create_sub_pool_from_parent(
            actor_pool_config, process_index, start_python
        )

    @classmethod
    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
        """Await every sub pool creation task and collect the results.

        Args:
            create_pool_tasks: tasks that each resolve to a
                ``(process, external_addresses)`` pair.

        Returns:
            ``(processes, ext_addresses)`` in task order.

        Raises:
            Exception: the first exception raised by any task. Before it is
                re-raised, every sub process that did start is killed.
        """
        processes: list[asyncio.subprocess.Process] = []
        ext_addresses = []
        error = None
        for task in create_pool_tasks:
            try:
                process, address = await task
            except Exception as e:
                # Keep the first failure, but still drain the remaining tasks
                # so that every process that did start is known and can be
                # cleaned up below.
                if error is None:
                    error = e
                continue
            processes.append(process)
            ext_addresses.append(address)
        if error:
            for p in processes:
                # error happens, kill all subprocesses
                try:
                    p.kill()
                except ProcessLookupError:
                    # the process has already exited
                    pass
            raise error
        return processes, ext_addresses

    @classmethod
    def _start_sub_pool_in_child(
        cls,
        shm_name: str,
    ):
        """Entry point executed inside the sub pool process.

        Reads the start-up parameters from the shared memory named
        ``shm_name``, starts a watchdog thread that exits the process when
        the main pool process no longer exists, configures random seeds,
        SIGINT handling and logging, then runs the sub pool until it stops.
        """
        ensure_coverage()

        shm = shared_memory.SharedMemory(shm_name, track=False)
        try:
            config = _shm_get_object(_ShmSeq.INIT_PARAMS, shm)
            # Check Python version once.
            # "python_version" is the main pool's ``sys.hexversion``, stored
            # by the parent; ``sys.hexversion`` here is the sub pool's.
            main_pool_python_version = config.pop("python_version", None)
            if (
                main_pool_python_version is not None
                and main_pool_python_version != sys.hexversion
            ):
                logger.warning(
                    "The sub pool is using a different Python version, "
                    "you may encounter serialization issues."
                    " sub pool: %s, main pool: %s",
                    sys.hexversion,
                    main_pool_python_version,
                )
            actor_config = config["actor_pool_config"]
            process_index = config["process_index"]
            main_pool_pid = config["main_pool_pid"]

            def _check_ppid():
                # Poll every 10 seconds; exit once the main pool is gone.
                while True:
                    try:
                        # We can't simply check if the os.getppid() equals with main_pool_pid,
                        # as the double fork may result in a new process as the parent.
                        psutil.Process(main_pool_pid)
                    except psutil.NoSuchProcess:
                        logger.error("Exit due to main pool %s exit.", main_pool_pid)
                        os._exit(233)  # Special exit code for debugging.
                    except Exception as e:
                        logger.exception("Check ppid failed: %s", e)
                    time.sleep(10)

            t = threading.Thread(target=_check_ppid, daemon=True)
            t.start()

            # make sure enough randomness for every sub pool
            random.seed(uuid.uuid1().bytes)
            reset_id_random_seed()

            conf = actor_config.get_pool_config(process_index)
            suspend_sigint = conf["suspend_sigint"]
            if suspend_sigint:
                # ignore SIGINT in this process
                signal.signal(signal.SIGINT, lambda *_: None)

            logging_conf = conf["logging_conf"] or {}
            if isinstance(logging_conf, configparser.RawConfigParser):
                logging.config.fileConfig(logging_conf)
            elif logging_conf.get("dict"):
                logging.config.dictConfig(logging_conf["dict"])
            elif logging_conf.get("file"):
                logging.config.fileConfig(logging_conf["file"])
            elif logging_conf.get("level"):
                logging.getLogger("__main__").setLevel(logging_conf["level"])
                logging.getLogger("xoscar").setLevel(logging_conf["level"])
                if logging_conf.get("format"):
                    logging.basicConfig(format=logging_conf["format"])

            use_uvloop = conf["use_uvloop"]
            if use_uvloop:
                import uvloop

                asyncio.set_event_loop(uvloop.new_event_loop())
            else:
                asyncio.set_event_loop(asyncio.new_event_loop())

            coro = cls._create_sub_pool(actor_config, process_index, main_pool_pid, shm)
            # NOTE(review): asyncio.run() is documented to always create a
            # new event loop, so the loop installed just above does not
            # appear to be the one that runs ``coro`` - confirm whether
            # ``use_uvloop`` is expected to take effect here.
            asyncio.run(coro)
        finally:
            shm.close()

    @classmethod
    async def _create_sub_pool(
        cls,
        actor_config: ActorPoolConfig,
        process_index: int,
        main_pool_pid: int,
        shm: shared_memory.SharedMemory,
    ):
        """Create and run the ``SubActorPool`` inside the sub pool process.

        Applies the pool's configured ``env`` to ``os.environ``, starts the
        pool, publishes the configured external address through ``shm`` and
        then waits until the pool finishes.
        """
        cur_pool_config = actor_config.get_pool_config(process_index)
        env = cur_pool_config["env"]
        if env:
            os.environ.update(env)
        pool = await SubActorPool.create(
            {
                "actor_pool_config": actor_config,
                "process_index": process_index,
                "main_pool_pid": main_pool_pid,
            }
        )
        await pool.start()
        # Tell the parent that start-up succeeded and which address is used.
        _shm_put_object(_ShmSeq.INIT_RESULT, shm, cur_pool_config["external_address"])
        await pool.join()

    @staticmethod
    async def _create_sub_pool_from_parent(
        actor_pool_config: ActorPoolConfig,
        process_index: int,
        start_python: str | None = None,
    ):
        """Spawn a sub pool process and wait for its start-up result.

        Args:
            actor_pool_config: configuration shared with the sub pool.
            process_index: index of the sub pool to start.
            start_python: Python executable to launch; defaults to
                ``sys.executable``.

        Returns:
            ``(process, external_addresses)`` where ``external_addresses`` is
            the value the child stored under ``_ShmSeq.INIT_RESULT``.

        Raises:
            OSError: if no start-up result was received from the child.
        """
        # The Python version is sent along with the parameters and compared
        # once inside the child (see _start_sub_pool_in_child) to make it
        # faster, as in most cases the Python versions are the same.
        if start_python is None:
            start_python = sys.executable

        external_addresses: List | None = None
        shm = shared_memory.SharedMemory(
            create=True, size=_SUBPROCESS_SHM_SIZE, track=False
        )
        try:
            _shm_put_object(
                _ShmSeq.INIT_PARAMS,
                shm,
                {
                    "actor_pool_config": actor_pool_config,
                    "process_index": process_index,
                    "main_pool_pid": os.getpid(),
                    "python_version": sys.hexversion,
                },
            )
            cmd = [
                start_python,
                "-m",
                "xoscar.backends.indigen",
                "start_sub_pool",
                "-sn",
                shm.name,
            ]
            # We need to inherit the parent environment to ensure the subprocess works correctly on Windows.
            new_env = dict(os.environ)
            env = actor_pool_config.get_pool_config(process_index).get("env") or {}
            new_env.update(env)
            logger.info("Creating sub pool via command: %s", cmd)
            process = await create_subprocess_exec(*cmd, env=new_env)

            def _get_external_addresses():
                # Runs in a worker thread: poll the shared memory until the
                # child publishes its result or the memory has been closed.
                try:
                    nonlocal external_addresses
                    while (
                        shm
                        and shm.buf is not None
                        and not (
                            external_addresses := _shm_get_object(
                                _ShmSeq.INIT_RESULT, shm
                            )
                        )
                    ):
                        time.sleep(0.1)
                except asyncio.CancelledError:
                    pass

            # Whichever happens first: the child exits, or it reports back.
            _, unfinished = await asyncio.wait(
                [
                    asyncio.create_task(process.wait()),
                    asyncio.create_task(asyncio.to_thread(_get_external_addresses)),
                ],
                return_when=asyncio.FIRST_COMPLETED,
            )
            for t in unfinished:
                t.cancel()
        finally:
            shm.close()
            shm.unlink()
        if external_addresses is None:
            raise OSError(f"Start sub pool failed, returncode: {process.returncode}")
        return process, external_addresses

    async def append_sub_pool(
        self,
        label: str | None = None,
        internal_address: str | None = None,
        external_address: str | None = None,
        env: dict | None = None,
        modules: list[str] | None = None,
        suspend_sigint: bool | None = None,
        use_uvloop: bool | None = None,
        logging_conf: dict | None = None,
        start_python: str | None = None,
        kwargs: dict | None = None,
    ):
        """Add a new sub pool to this main pool and start its process.

        ``logging_conf`` and ``use_uvloop`` fall back to the values of the
        last configured pool when not given. After the process has started,
        the updated configuration is pushed through a ``sync_config``
        control message.

        Returns:
            The external address reported by the new sub pool.
        """
        # external_address has port 0, subprocess will bind random port.
        external_address = (
            external_address
            or MainActorPool.get_external_addresses(self.external_address, n_process=1)[
                -1
            ]
        )

        # use last process index's logging_conf and use_uv_loop config if not provide
        actor_pool_config = self._config.as_dict()
        last_process_index = self._config.get_process_indexes()[-1]
        last_logging_conf = actor_pool_config["pools"][last_process_index][
            "logging_conf"
        ]
        last_use_uv_loop = actor_pool_config["pools"][last_process_index]["use_uvloop"]
        _logging_conf = logging_conf or last_logging_conf
        _use_uv_loop = use_uvloop if use_uvloop is not None else last_use_uv_loop

        process_index = next(MainActorPool.process_index_gen(external_address))
        internal_address = internal_address or MainActorPool.gen_internal_address(
            process_index, external_address
        )

        self._config.add_pool_conf(
            process_index,
            label,
            internal_address,
            external_address,
            env,
            modules,
            suspend_sigint,
            _use_uv_loop,
            _logging_conf,
            kwargs,
        )

        process, external_addresses = await self._create_sub_pool_from_parent(
            self._config, process_index, start_python
        )

        self._config.reset_pool_external_address(process_index, external_addresses[0])
        self.attach_sub_process(external_addresses[0], process)

        control_message = ControlMessage(
            message_id=new_message_id(),
            address=self.external_address,
            control_message_type=ControlMessageType.sync_config,
            content=self._config,
        )
        await self.handle_control_command(control_message)
        # The actual port will return in process_status.
        return external_addresses[0]

    async def remove_sub_pool(
        self, external_address: str, timeout: float | None = None, force: bool = False
    ):
        """Stop the sub pool at ``external_address`` and drop its config.

        The process and its configuration are removed from this pool's
        bookkeeping before the sub pool is stopped; afterwards the updated
        configuration is pushed through a ``sync_config`` control message.
        """
        process = self.sub_processes[external_address]
        process_index = self._config.get_process_index(external_address)
        del self.sub_processes[external_address]
        self._config.remove_pool_config(process_index)
        await self.stop_sub_pool(external_address, process, timeout, force)

        control_message = ControlMessage(
            message_id=new_message_id(),
            address=self.external_address,
            control_message_type=ControlMessageType.sync_config,
            content=self._config,
        )
        await self.handle_control_command(control_message)

    async def kill_sub_pool(
        self, process: asyncio.subprocess.Process, force: bool = False
    ):
        """Terminate the given sub pool process.

        Unless ``force`` is set, the process is first asked to terminate and
        given up to five seconds to exit. It is then killed, retrying at
        most three times while it is still reported as running.
        """
        try:
            p = psutil.Process(process.pid)
        except psutil.NoSuchProcess:
            # already gone, nothing to kill
            return

        # NOTE(review): ``p.wait(5)`` and ``time.sleep(0.1)`` below are
        # blocking calls inside a coroutine; they are left untouched here
        # because the effect of yielding to the event loop during shutdown
        # cannot be judged from this file alone.
        if not force:  # pragma: no cover
            p.terminate()
            try:
                p.wait(5)
            except psutil.TimeoutExpired:
                pass

        count = 0
        while p.is_running() and count < 3:
            count += 1
            p.kill()
            if not p.is_running():
                return
            logger.info("Sub pool can't be killed: %s", p)
            time.sleep(0.1)

    async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
        """Return ``True`` while ``process`` has no return code yet."""
        return process.returncode is None

    async def recover_sub_pool(self, address: str):
        """Restart the sub pool registered at ``address``.

        When ``self._auto_recover`` is ``"actor"``, every creation message
        recorded for that address is sent again to the restarted pool.
        """
        process_index = self._config.get_process_index(address)
        # process dead, restart it
        # remember always use spawn to recover sub pool
        task = asyncio.create_task(self.start_sub_pool(self._config, process_index))
        self.sub_processes[address] = (await self.wait_sub_pools_ready([task]))[0][0]

        if self._auto_recover == "actor":
            # need to recover all created actors
            for _, message in self._allocated_actors[address].values():
                create_actor_message: CreateActorMessage = message  # type: ignore
                await self.call(address, create_actor_message)

    async def start(self):
        """Start the pool, then start monitoring."""
        await super().start()
        await self.start_monitor()
461
+
462
+
463
@_register_message_handler
class SubActorPool(SubActorPoolBase):
    """Sub actor pool for this backend; adds nothing to ``SubActorPoolBase``."""