xoscar 0.9.0__cp312-cp312-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xoscar/__init__.py +61 -0
- xoscar/_utils.cpython-312-darwin.so +0 -0
- xoscar/_utils.pxd +36 -0
- xoscar/_utils.pyx +246 -0
- xoscar/_version.py +693 -0
- xoscar/aio/__init__.py +16 -0
- xoscar/aio/base.py +86 -0
- xoscar/aio/file.py +59 -0
- xoscar/aio/lru.py +228 -0
- xoscar/aio/parallelism.py +39 -0
- xoscar/api.py +527 -0
- xoscar/backend.py +67 -0
- xoscar/backends/__init__.py +14 -0
- xoscar/backends/allocate_strategy.py +160 -0
- xoscar/backends/communication/__init__.py +30 -0
- xoscar/backends/communication/base.py +315 -0
- xoscar/backends/communication/core.py +69 -0
- xoscar/backends/communication/dummy.py +253 -0
- xoscar/backends/communication/errors.py +20 -0
- xoscar/backends/communication/socket.py +444 -0
- xoscar/backends/communication/ucx.py +538 -0
- xoscar/backends/communication/utils.py +97 -0
- xoscar/backends/config.py +157 -0
- xoscar/backends/context.py +437 -0
- xoscar/backends/core.py +352 -0
- xoscar/backends/indigen/__init__.py +16 -0
- xoscar/backends/indigen/__main__.py +19 -0
- xoscar/backends/indigen/backend.py +51 -0
- xoscar/backends/indigen/driver.py +26 -0
- xoscar/backends/indigen/fate_sharing.py +221 -0
- xoscar/backends/indigen/pool.py +515 -0
- xoscar/backends/indigen/shared_memory.py +548 -0
- xoscar/backends/message.cpython-312-darwin.so +0 -0
- xoscar/backends/message.pyi +255 -0
- xoscar/backends/message.pyx +646 -0
- xoscar/backends/pool.py +1630 -0
- xoscar/backends/router.py +285 -0
- xoscar/backends/test/__init__.py +16 -0
- xoscar/backends/test/backend.py +38 -0
- xoscar/backends/test/pool.py +233 -0
- xoscar/batch.py +256 -0
- xoscar/collective/__init__.py +27 -0
- xoscar/collective/backend/__init__.py +13 -0
- xoscar/collective/backend/nccl_backend.py +160 -0
- xoscar/collective/common.py +102 -0
- xoscar/collective/core.py +737 -0
- xoscar/collective/process_group.py +687 -0
- xoscar/collective/utils.py +41 -0
- xoscar/collective/xoscar_pygloo.cpython-312-darwin.so +0 -0
- xoscar/collective/xoscar_pygloo.pyi +239 -0
- xoscar/constants.py +23 -0
- xoscar/context.cpython-312-darwin.so +0 -0
- xoscar/context.pxd +21 -0
- xoscar/context.pyx +368 -0
- xoscar/core.cpython-312-darwin.so +0 -0
- xoscar/core.pxd +51 -0
- xoscar/core.pyx +664 -0
- xoscar/debug.py +188 -0
- xoscar/driver.py +42 -0
- xoscar/errors.py +63 -0
- xoscar/libcpp.pxd +31 -0
- xoscar/metrics/__init__.py +21 -0
- xoscar/metrics/api.py +288 -0
- xoscar/metrics/backends/__init__.py +13 -0
- xoscar/metrics/backends/console/__init__.py +13 -0
- xoscar/metrics/backends/console/console_metric.py +82 -0
- xoscar/metrics/backends/metric.py +149 -0
- xoscar/metrics/backends/prometheus/__init__.py +13 -0
- xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
- xoscar/nvutils.py +717 -0
- xoscar/profiling.py +260 -0
- xoscar/serialization/__init__.py +20 -0
- xoscar/serialization/aio.py +141 -0
- xoscar/serialization/core.cpython-312-darwin.so +0 -0
- xoscar/serialization/core.pxd +28 -0
- xoscar/serialization/core.pyi +57 -0
- xoscar/serialization/core.pyx +944 -0
- xoscar/serialization/cuda.py +111 -0
- xoscar/serialization/exception.py +48 -0
- xoscar/serialization/mlx.py +67 -0
- xoscar/serialization/numpy.py +82 -0
- xoscar/serialization/pyfury.py +37 -0
- xoscar/serialization/scipy.py +72 -0
- xoscar/serialization/torch.py +180 -0
- xoscar/utils.py +522 -0
- xoscar/virtualenv/__init__.py +34 -0
- xoscar/virtualenv/core.py +268 -0
- xoscar/virtualenv/platform.py +56 -0
- xoscar/virtualenv/utils.py +100 -0
- xoscar/virtualenv/uv.py +321 -0
- xoscar-0.9.0.dist-info/METADATA +230 -0
- xoscar-0.9.0.dist-info/RECORD +94 -0
- xoscar-0.9.0.dist-info/WHEEL +6 -0
- xoscar-0.9.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
# Copyright 2022-2023 XProbe Inc.
|
|
2
|
+
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import asyncio.subprocess
|
|
20
|
+
import configparser
|
|
21
|
+
import itertools
|
|
22
|
+
import logging.config
|
|
23
|
+
import os
|
|
24
|
+
import pickle
|
|
25
|
+
import random
|
|
26
|
+
import signal
|
|
27
|
+
import struct
|
|
28
|
+
import sys
|
|
29
|
+
import threading
|
|
30
|
+
import time
|
|
31
|
+
import uuid
|
|
32
|
+
from enum import IntEnum
|
|
33
|
+
from typing import List, Optional
|
|
34
|
+
|
|
35
|
+
import psutil
|
|
36
|
+
|
|
37
|
+
from ..._utils import reset_id_random_seed
|
|
38
|
+
from ...utils import ensure_coverage
|
|
39
|
+
from ..config import ActorPoolConfig
|
|
40
|
+
from ..message import (
|
|
41
|
+
ControlMessage,
|
|
42
|
+
ControlMessageType,
|
|
43
|
+
CreateActorMessage,
|
|
44
|
+
new_message_id,
|
|
45
|
+
)
|
|
46
|
+
from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
|
|
47
|
+
from . import shared_memory
|
|
48
|
+
from .fate_sharing import create_subprocess_exec
|
|
49
|
+
|
|
50
|
+
# Size in bytes of the shared-memory segment used to exchange start-up data
# with a sub pool process (10 KiB).
_SUBPROCESS_SHM_SIZE = 10 * 1024
# True when sys.platform starts with "win".
_is_windows: bool = sys.platform.startswith("win")

# Logger named after this module.
logger = logging.getLogger(__name__)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class _ShmSeq(IntEnum):
    """Tags stored in the first four bytes of the shared-memory buffer.

    The tag tells a reader which kind of payload the buffer currently holds.
    """

    # Start-up parameters payload.
    INIT_PARAMS = 1
    # Start-up result payload.
    INIT_RESULT = 2
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _shm_put_object(seq: _ShmSeq, shm: shared_memory.SharedMemory, o: object):
    """Pickle ``o`` into ``shm`` and tag the buffer with ``seq``.

    Buffer layout: bytes 0-3 hold the tag and bytes 4-7 hold the payload
    length, both as little-endian unsigned 32-bit integers; the pickled
    payload starts at byte 8.

    Raises:
        AssertionError: if the pickled payload does not fit in
            ``_SUBPROCESS_SHM_SIZE - 8`` bytes.
    """
    payload = pickle.dumps(o)
    payload_size = len(payload)
    assert (
        payload_size < _SUBPROCESS_SHM_SIZE - 8
    ), f"Serialized object {o} is too long."
    # Length and payload go in first and the tag last; presumably so that a
    # reader which checks the tag first never sees a half-written payload.
    struct.pack_into("<I", shm.buf, 4, payload_size)
    shm.buf[8 : 8 + payload_size] = payload
    struct.pack_into("<I", shm.buf, 0, seq)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _shm_get_object(seq: _ShmSeq, shm: shared_memory.SharedMemory):
    """Read the object stored in ``shm`` if the buffer is tagged with ``seq``.

    Returns:
        The unpickled payload, or ``None`` when the tag in bytes 0-3 differs
        from ``seq``.
    """
    # NOTE(review): the payload is unpickled as-is, so the segment must only
    # ever be written by trusted processes.
    (tag,) = struct.unpack_from("<I", shm.buf, 0)
    if tag != seq:
        return None
    (payload_size,) = struct.unpack_from("<I", shm.buf, 4)
    payload_end = 8 + payload_size
    return pickle.loads(shm.buf[8:payload_end])
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@_register_message_handler
|
|
80
|
+
class MainActorPool(MainActorPoolBase):
|
|
81
|
+
@classmethod
|
|
82
|
+
def get_external_addresses(
    cls,
    address: str,
    n_process: int | None = None,
    ports: list[int] | None = None,
    schemes: list[Optional[str]] | None = None,
):
    """Build the external address of the main pool and of every sub process.

    When ``address`` contains ``":"`` the text after the last colon is the
    main port and ``ports``, if given, must hold ``n_process`` sub ports.
    Otherwise ``ports``, if given, must hold ``n_process + 1`` entries, the
    first being the main port. Ports that are not supplied default to ``0``.

    ``schemes``, when non-empty, supplies one prefix per address (a falsy
    entry means no prefix). The addresses are zipped with the prefixes, so
    the result is no longer than ``schemes``.

    Returns:
        A list of ``"{prefix}{host}:{port}"`` strings, main address first.

    Raises:
        ValueError: if the number of ``ports`` does not match ``n_process``.
    """
    assert n_process is not None
    if ":" not in address:
        host = address
        if not ports:
            ports = [0] * (n_process + 1)
        elif len(ports) != n_process + 1:
            # ports specified, the first of which should be main port
            raise ValueError(
                f"`ports` specified, but its count "
                f"is not equal to `n_process` + 1, "
                f"number of ports: {len(ports)}, "
                f"n_process + 1: {n_process + 1}"
            )
        main_port = ports[0]
        sub_ports = ports[1:]
    else:
        host, port_text = address.rsplit(":", 1)
        main_port = int(port_text)
        if not ports:
            sub_ports = [0] * n_process
        elif len(ports) != n_process:
            raise ValueError(
                f"`ports` specified, but its count "
                f"is not equal to `n_process`, "
                f"number of ports: {len(ports)}, "
                f"n_process: {n_process}"
            )
        else:
            sub_ports = ports

    if schemes:
        prefixes = [f"{scheme}://" if scheme else "" for scheme in schemes]  # type: ignore
    else:
        prefixes = itertools.repeat("")  # type: ignore

    all_ports = [main_port] + sub_ports
    return [
        f"{prefix}{host}:{one_port}" for one_port, prefix in zip(all_ports, prefixes)
    ]
|
|
127
|
+
|
|
128
|
+
@classmethod
|
|
129
|
+
def gen_internal_address(
    cls, process_index: int, external_address: str | None = None
) -> str | None:
    """Return the internal address for the sub process ``process_index``.

    A ``unixsocket:///<process_index>`` address is used when ``asyncio``
    provides ``start_unix_server``; otherwise ``external_address`` is
    returned unchanged.
    """
    if not hasattr(asyncio, "start_unix_server"):
        return external_address
    return f"unixsocket:///{process_index}"
|
|
136
|
+
|
|
137
|
+
@classmethod
|
|
138
|
+
async def start_sub_pool(
    cls,
    actor_pool_config: ActorPoolConfig,
    process_index: int,
    start_python: str | None = None,
):
    """Start the sub pool ``process_index`` by delegating to
    ``cls._create_sub_pool_from_parent`` and return its result.
    """
    launch = cls._create_sub_pool_from_parent(
        actor_pool_config, process_index, start_python
    )
    return await launch
|
|
147
|
+
|
|
148
|
+
@classmethod
|
|
149
|
+
async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
    """Await every sub pool creation task and collect the results.

    Each task is expected to resolve to a ``(process, address)`` pair.

    If any task raises, the remaining tasks are still awaited so that every
    subprocess that did start is known, all of those subprocesses are killed,
    and the first error is re-raised. Cancellation is not intercepted.

    Returns:
        ``(processes, ext_addresses)``, both in task order.
    """
    processes: list[asyncio.subprocess.Process] = []
    ext_addresses = []
    error: Exception | None = None
    for task in create_pool_tasks:
        try:
            process, address = await task
        except Exception as exc:
            # Remember only the first failure; keep collecting the rest so
            # the started subprocesses can be cleaned up below.
            if error is None:
                error = exc
            continue
        processes.append(process)
        ext_addresses.append(address)
    if error is not None:
        for p in processes:
            # error happens, kill all subprocesses
            try:
                p.kill()
            except ProcessLookupError:
                # Process already exited.
                pass
        raise error
    return processes, ext_addresses
|
|
163
|
+
|
|
164
|
+
@classmethod
|
|
165
|
+
def _start_sub_pool_in_child(
    cls,
    shm_name: str,
):
    """Run a sub pool inside the spawned child process.

    Reads the start-up parameters from the shared-memory segment
    ``shm_name``, starts a daemon thread that exits this process once the
    main pool's pid no longer exists, reseeds the random generators, applies
    the pool's SIGINT / logging / event loop settings and finally runs
    ``cls._create_sub_pool`` to completion. The segment is closed on exit.
    """
    ensure_coverage()

    shm = shared_memory.SharedMemory(shm_name, track=False)
    try:
        config = _shm_get_object(_ShmSeq.INIT_PARAMS, shm)
        # Check Python version once.
        # "python_version" is the sys.hexversion recorded by the process
        # that wrote the parameters, i.e. the main pool; sys.hexversion read
        # here belongs to this sub pool process.
        main_pool_python_version = config.pop("python_version", None)
        if (
            main_pool_python_version is not None
            and main_pool_python_version != sys.hexversion
        ):
            logger.warning(
                f"The sub pool is using a different Python version, you may encounter serialization issues."
                f" sub pool: {sys.hexversion}, main pool: {main_pool_python_version}"
            )
        actor_config = config["actor_pool_config"]
        process_index = config["process_index"]
        main_pool_pid = config["main_pool_pid"]

        def _check_ppid():
            # Poll every 10 seconds; leave as soon as the main pool is gone.
            while True:
                try:
                    # We can't simply check if the os.getppid() equals with main_pool_pid,
                    # as the double fork may result in a new process as the parent.
                    psutil.Process(main_pool_pid)
                except psutil.NoSuchProcess:
                    logger.error("Exit due to main pool %s exit.", main_pool_pid)
                    os._exit(233)  # Special exit code for debugging.
                except Exception as e:
                    logger.exception("Check ppid failed: %s", e)
                time.sleep(10)

        t = threading.Thread(target=_check_ppid, daemon=True)
        t.start()

        # make sure enough randomness for every sub pool
        random.seed(uuid.uuid1().bytes)
        reset_id_random_seed()

        conf = actor_config.get_pool_config(process_index)
        suspend_sigint = conf["suspend_sigint"]
        if suspend_sigint:
            # Replace the SIGINT handler with a no-op.
            signal.signal(signal.SIGINT, lambda *_: None)

        logging_conf = conf["logging_conf"] or {}
        if isinstance(logging_conf, configparser.RawConfigParser):
            logging.config.fileConfig(logging_conf)
        elif logging_conf.get("dict"):
            logging.config.dictConfig(logging_conf["dict"])
        elif logging_conf.get("file"):
            logging.config.fileConfig(logging_conf["file"])
        elif logging_conf.get("level"):
            logging.getLogger("__main__").setLevel(logging_conf["level"])
            logging.getLogger("xoscar").setLevel(logging_conf["level"])
            if logging_conf.get("format"):
                logging.basicConfig(format=logging_conf["format"])

        use_uvloop = conf["use_uvloop"]
        if use_uvloop:
            import uvloop

            asyncio.set_event_loop(uvloop.new_event_loop())
        else:
            asyncio.set_event_loop(asyncio.new_event_loop())

        coro = cls._create_sub_pool(actor_config, process_index, main_pool_pid, shm)
        # NOTE(review): asyncio.run() always creates its own new event loop,
        # so the loop installed with set_event_loop() above (including the
        # uvloop one) is not the loop that runs `coro`. Left unchanged here;
        # confirm whether uvloop is meant to drive the sub pool.
        asyncio.run(coro)
    finally:
        shm.close()
|
|
238
|
+
|
|
239
|
+
@classmethod
|
|
240
|
+
async def _create_sub_pool(
    cls,
    actor_config: ActorPoolConfig,
    process_index: int,
    main_pool_pid: int,
    shm: shared_memory.SharedMemory,
):
    """Create, start and run the ``SubActorPool`` for ``process_index``.

    The pool's configured ``env`` (if any) is merged into ``os.environ``
    first. After the pool has started, its configured ``external_address``
    is written to ``shm`` tagged ``INIT_RESULT``; the coroutine then waits
    until the pool has been joined.
    """
    pool_conf = actor_config.get_pool_config(process_index)
    extra_env = pool_conf["env"]
    if extra_env:
        os.environ.update(extra_env)
    create_params = {
        "actor_pool_config": actor_config,
        "process_index": process_index,
        "main_pool_pid": main_pool_pid,
    }
    pool = await SubActorPool.create(create_params)
    await pool.start()
    # Publish the result only once the pool is started.
    _shm_put_object(_ShmSeq.INIT_RESULT, shm, pool_conf["external_address"])
    await pool.join()
|
|
261
|
+
|
|
262
|
+
@staticmethod
|
|
263
|
+
async def _create_sub_pool_from_parent(
    actor_pool_config: ActorPoolConfig,
    process_index: int,
    start_python: str | None = None,
):
    """Spawn the sub pool process ``process_index`` and wait for its result.

    The start-up parameters are written to a fresh shared-memory segment and
    the child is launched as ``<start_python> -m xoscar.backends.indigen
    start_sub_pool -sn <segment name>`` (``sys.executable`` when
    ``start_python`` is ``None``). The call then waits until either the
    child exits or a worker thread reads an ``INIT_RESULT`` payload from the
    segment. The segment is closed and unlinked before returning.

    Returns:
        ``(process, external_addresses)``.

    Raises:
        OSError: if no result was read from the segment.
    """
    # The Python version is not compared here; this process's sys.hexversion
    # is shipped with the start-up parameters instead, which keeps the
    # common same-version case fast.
    if start_python is None:
        start_python = sys.executable

    external_addresses: List | None = None
    shm = shared_memory.SharedMemory(
        create=True, size=_SUBPROCESS_SHM_SIZE, track=False
    )
    try:
        init_params = {
            "actor_pool_config": actor_pool_config,
            "process_index": process_index,
            "main_pool_pid": os.getpid(),
            "python_version": sys.hexversion,
        }
        _shm_put_object(_ShmSeq.INIT_PARAMS, shm, init_params)
        cmd = [
            start_python,
            "-m",
            "xoscar.backends.indigen",
            "start_sub_pool",
            "-sn",
            shm.name,
        ]
        # We need to inherit the parent environment to ensure the subprocess works correctly on Windows.
        new_env = dict(os.environ)
        pool_env = actor_pool_config.get_pool_config(process_index).get("env") or {}
        new_env.update(pool_env)
        if os.getenv("XOSCAR_CPU_AFFINITY") == "1":
            import multiprocessing

            # Prefix the command with taskset over the whole core range.
            last_core = multiprocessing.cpu_count() - 1
            cmd = ["taskset", "-c", f"0-{last_core}"] + cmd
        logger.info("Creating sub pool via command: %s", cmd)
        process = await create_subprocess_exec(*cmd, env=new_env)

        def _poll_init_result():
            # Runs in a worker thread: check the segment every 0.1 s until a
            # truthy INIT_RESULT payload shows up or the buffer is gone.
            # NOTE(review): cancelling the asyncio task that wraps this
            # thread does not interrupt the thread, so the CancelledError
            # handler below is presumably never triggered; the loop ends
            # through the `shm.buf is not None` check instead.
            nonlocal external_addresses
            try:
                while shm and shm.buf is not None:
                    external_addresses = _shm_get_object(_ShmSeq.INIT_RESULT, shm)
                    if external_addresses:
                        break
                    time.sleep(0.1)
            except asyncio.CancelledError:
                pass

        exit_waiter = asyncio.create_task(process.wait())
        result_poller = asyncio.create_task(asyncio.to_thread(_poll_init_result))
        _, unfinished = await asyncio.wait(
            [exit_waiter, result_poller],
            return_when=asyncio.FIRST_COMPLETED,
        )
        for pending in unfinished:
            pending.cancel()
    finally:
        shm.close()
        shm.unlink()
    if external_addresses is None:
        raise OSError(f"Start sub pool failed, returncode: {process.returncode}")
    return process, external_addresses
|
|
340
|
+
|
|
341
|
+
async def append_sub_pool(
    self,
    label: str | None = None,
    internal_address: str | None = None,
    external_address: str | None = None,
    env: dict | None = None,
    modules: list[str] | None = None,
    suspend_sigint: bool | None = None,
    use_uvloop: bool | None = None,
    logging_conf: dict | None = None,
    start_python: str | None = None,
    kwargs: dict | None = None,
):
    """Add a new sub pool process to this main pool and return its address.

    A configuration entry is registered for the new process, the process is
    spawned, the entry's external address is replaced by the one the process
    reported, the process is attached to this pool and the updated
    configuration is sent out as a ``sync_config`` control message.

    ``logging_conf`` and ``use_uvloop`` default to the values of the last
    configured process. If spawning fails, the configuration entry that was
    just registered is removed again before the error propagates.

    Returns:
        The first external address reported by the new sub pool.
    """
    # external_address has port 0, subprocess will bind random port.
    external_address = (
        external_address
        or MainActorPool.get_external_addresses(self.external_address, n_process=1)[
            -1
        ]
    )

    # Use the last process index's logging_conf and use_uvloop config if not
    # provided.
    actor_pool_config = self._config.as_dict()
    last_process_index = self._config.get_process_indexes()[-1]
    last_pool_conf = actor_pool_config["pools"][last_process_index]
    last_logging_conf = last_pool_conf["logging_conf"]
    last_use_uv_loop = last_pool_conf["use_uvloop"]
    _logging_conf = logging_conf or last_logging_conf
    _use_uv_loop = use_uvloop if use_uvloop is not None else last_use_uv_loop

    process_index = next(MainActorPool.process_index_gen(external_address))
    internal_address = internal_address or MainActorPool.gen_internal_address(
        process_index, external_address
    )

    self._config.add_pool_conf(
        process_index,
        label,
        internal_address,
        external_address,
        env,
        modules,
        suspend_sigint,
        _use_uv_loop,
        _logging_conf,
        kwargs,
    )

    try:
        process, external_addresses = await self._create_sub_pool_from_parent(
            self._config, process_index, start_python
        )
    except BaseException:
        # The sub pool never came up: drop the entry registered above so
        # the configuration does not describe a pool that does not exist.
        self._config.remove_pool_config(process_index)
        raise

    self._config.reset_pool_external_address(process_index, external_addresses[0])
    self.attach_sub_process(external_addresses[0], process)

    control_message = ControlMessage(
        message_id=new_message_id(),
        address=self.external_address,
        control_message_type=ControlMessageType.sync_config,
        content=self._config,
    )
    await self.handle_control_command(control_message)
    # The actual port will return in process_status.
    return external_addresses[0]
|
|
406
|
+
|
|
407
|
+
async def remove_sub_pool(
    self, external_address: str, timeout: float | None = None, force: bool = False
):
    """Remove the sub pool registered under ``external_address``.

    The process and its configuration entry are unregistered first, then the
    sub pool is stopped via ``stop_sub_pool`` and the updated configuration
    is sent out as a ``sync_config`` control message.

    Raises:
        KeyError: if ``external_address`` is not in ``self.sub_processes``.
    """
    sub_process = self.sub_processes[external_address]
    index = self._config.get_process_index(external_address)
    del self.sub_processes[external_address]
    self._config.remove_pool_config(index)
    await self.stop_sub_pool(external_address, sub_process, timeout, force)

    sync_message = ControlMessage(
        message_id=new_message_id(),
        address=self.external_address,
        control_message_type=ControlMessageType.sync_config,
        content=self._config,
    )
    await self.handle_control_command(sync_message)
|
|
423
|
+
|
|
424
|
+
async def kill_sub_pool(
    self, process: asyncio.subprocess.Process, force: bool = False
):
    """Stop the sub pool ``process``.

    Unless ``force`` is set, the process is terminated first and given two
    seconds to exit; if it does not, it is killed. Up to five more seconds
    are then spent waiting for the exit status. On Python 3.13+ the
    process transport and pipes are closed as best-effort cleanup. If no
    exit status has been recorded by then, psutil is used to kill the
    process and all of its descendants.
    """
    # First, try to terminate the process gracefully
    if not force:
        try:
            process.terminate()
            # Wait for graceful termination
            try:
                await asyncio.wait_for(process.wait(), timeout=2.0)
            except asyncio.TimeoutError:
                # Process didn't terminate gracefully, force kill
                force = True
        except ProcessLookupError:
            # Process already terminated
            pass

    # Force kill if needed or if graceful termination failed
    if force:
        try:
            process.kill()
        except ProcessLookupError:
            # Process already dead
            pass

    # Ensure process is completely terminated and cleaned up
    try:
        # Wait for process to complete
        if process.returncode is None:
            try:
                await asyncio.wait_for(process.wait(), timeout=5.0)
            except asyncio.TimeoutError:
                pass
    except ProcessLookupError:
        # Process already terminated
        pass

    # Python 3.13 specific cleanup for waitpid threads
    if sys.version_info >= (3, 13):
        try:
            # Close the transport to clean up waitpid thread
            if hasattr(process, "_transport") and process._transport:
                process._transport.close()
            # Also try to close the pipe transport if it exists
            if hasattr(process, "_pipes") and process._pipes:
                for pipe in process._pipes.values():
                    if hasattr(pipe, "close"):
                        pipe.close()
        except Exception:
            # Best-effort cleanup: record the failure but never raise.
            logger.debug("Ignored error during sub pool cleanup", exc_info=True)

    # Once an exit status is recorded the process is gone and its pid may
    # already belong to an unrelated process, so it must not be looked up
    # (and possibly killed) by pid any more.
    if process.returncode is not None:
        return

    # Additional cleanup using psutil to ensure process tree is terminated
    try:
        p = psutil.Process(process.pid)
        if p.is_running():
            # Kill the entire process tree
            for child in p.children(recursive=True):
                try:
                    child.kill()
                except psutil.NoSuchProcess:
                    pass
            p.kill()
            p.wait(timeout=2.0)
    except (psutil.NoSuchProcess, psutil.TimeoutExpired):
        # Process already dead or couldn't be killed
        pass
|
|
491
|
+
|
|
492
|
+
async def is_sub_pool_alive(self, process: asyncio.subprocess.Process):
    """Return ``True`` while ``process`` has no exit status recorded."""
    exit_status = process.returncode
    return exit_status is None
|
|
494
|
+
|
|
495
|
+
async def recover_sub_pool(self, address: str):
    """Restart the sub pool registered under ``address``.

    The new process replaces the entry in ``self.sub_processes``. When
    ``self._auto_recover`` is ``"actor"``, every creation message recorded
    in ``self._allocated_actors[address]`` is sent to the pool again.
    """
    process_index = self._config.get_process_index(address)
    # process dead, restart it
    # remember always use spawn to recover sub pool
    restart_task = asyncio.create_task(
        self.start_sub_pool(self._config, process_index)
    )
    ready = await self.wait_sub_pools_ready([restart_task])
    self.sub_processes[address] = ready[0][0]

    if self._auto_recover != "actor":
        return
    # need to recover all created actors
    for _, message in self._allocated_actors[address].values():
        create_actor_message: CreateActorMessage = message  # type: ignore
        await self.call(address, create_actor_message)
|
|
507
|
+
|
|
508
|
+
async def start(self):
    """Start the pool through the parent class, then start the monitor."""
    await super().start()
    await self.start_monitor()
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
@_register_message_handler
class SubActorPool(SubActorPoolBase):
    """Sub actor pool of this backend; adds nothing to ``SubActorPoolBase``."""
|