xoscar-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (94)
  1. xoscar/__init__.py +61 -0
  2. xoscar/_utils.cpython-312-darwin.so +0 -0
  3. xoscar/_utils.pxd +36 -0
  4. xoscar/_utils.pyx +246 -0
  5. xoscar/_version.py +693 -0
  6. xoscar/aio/__init__.py +16 -0
  7. xoscar/aio/base.py +86 -0
  8. xoscar/aio/file.py +59 -0
  9. xoscar/aio/lru.py +228 -0
  10. xoscar/aio/parallelism.py +39 -0
  11. xoscar/api.py +527 -0
  12. xoscar/backend.py +67 -0
  13. xoscar/backends/__init__.py +14 -0
  14. xoscar/backends/allocate_strategy.py +160 -0
  15. xoscar/backends/communication/__init__.py +30 -0
  16. xoscar/backends/communication/base.py +315 -0
  17. xoscar/backends/communication/core.py +69 -0
  18. xoscar/backends/communication/dummy.py +253 -0
  19. xoscar/backends/communication/errors.py +20 -0
  20. xoscar/backends/communication/socket.py +444 -0
  21. xoscar/backends/communication/ucx.py +538 -0
  22. xoscar/backends/communication/utils.py +97 -0
  23. xoscar/backends/config.py +157 -0
  24. xoscar/backends/context.py +437 -0
  25. xoscar/backends/core.py +352 -0
  26. xoscar/backends/indigen/__init__.py +16 -0
  27. xoscar/backends/indigen/__main__.py +19 -0
  28. xoscar/backends/indigen/backend.py +51 -0
  29. xoscar/backends/indigen/driver.py +26 -0
  30. xoscar/backends/indigen/fate_sharing.py +221 -0
  31. xoscar/backends/indigen/pool.py +515 -0
  32. xoscar/backends/indigen/shared_memory.py +548 -0
  33. xoscar/backends/message.cpython-312-darwin.so +0 -0
  34. xoscar/backends/message.pyi +255 -0
  35. xoscar/backends/message.pyx +646 -0
  36. xoscar/backends/pool.py +1630 -0
  37. xoscar/backends/router.py +285 -0
  38. xoscar/backends/test/__init__.py +16 -0
  39. xoscar/backends/test/backend.py +38 -0
  40. xoscar/backends/test/pool.py +233 -0
  41. xoscar/batch.py +256 -0
  42. xoscar/collective/__init__.py +27 -0
  43. xoscar/collective/backend/__init__.py +13 -0
  44. xoscar/collective/backend/nccl_backend.py +160 -0
  45. xoscar/collective/common.py +102 -0
  46. xoscar/collective/core.py +737 -0
  47. xoscar/collective/process_group.py +687 -0
  48. xoscar/collective/utils.py +41 -0
  49. xoscar/collective/xoscar_pygloo.cpython-312-darwin.so +0 -0
  50. xoscar/collective/xoscar_pygloo.pyi +239 -0
  51. xoscar/constants.py +23 -0
  52. xoscar/context.cpython-312-darwin.so +0 -0
  53. xoscar/context.pxd +21 -0
  54. xoscar/context.pyx +368 -0
  55. xoscar/core.cpython-312-darwin.so +0 -0
  56. xoscar/core.pxd +51 -0
  57. xoscar/core.pyx +664 -0
  58. xoscar/debug.py +188 -0
  59. xoscar/driver.py +42 -0
  60. xoscar/errors.py +63 -0
  61. xoscar/libcpp.pxd +31 -0
  62. xoscar/metrics/__init__.py +21 -0
  63. xoscar/metrics/api.py +288 -0
  64. xoscar/metrics/backends/__init__.py +13 -0
  65. xoscar/metrics/backends/console/__init__.py +13 -0
  66. xoscar/metrics/backends/console/console_metric.py +82 -0
  67. xoscar/metrics/backends/metric.py +149 -0
  68. xoscar/metrics/backends/prometheus/__init__.py +13 -0
  69. xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
  70. xoscar/nvutils.py +717 -0
  71. xoscar/profiling.py +260 -0
  72. xoscar/serialization/__init__.py +20 -0
  73. xoscar/serialization/aio.py +141 -0
  74. xoscar/serialization/core.cpython-312-darwin.so +0 -0
  75. xoscar/serialization/core.pxd +28 -0
  76. xoscar/serialization/core.pyi +57 -0
  77. xoscar/serialization/core.pyx +944 -0
  78. xoscar/serialization/cuda.py +111 -0
  79. xoscar/serialization/exception.py +48 -0
  80. xoscar/serialization/mlx.py +67 -0
  81. xoscar/serialization/numpy.py +82 -0
  82. xoscar/serialization/pyfury.py +37 -0
  83. xoscar/serialization/scipy.py +72 -0
  84. xoscar/serialization/torch.py +180 -0
  85. xoscar/utils.py +522 -0
  86. xoscar/virtualenv/__init__.py +34 -0
  87. xoscar/virtualenv/core.py +268 -0
  88. xoscar/virtualenv/platform.py +56 -0
  89. xoscar/virtualenv/utils.py +100 -0
  90. xoscar/virtualenv/uv.py +321 -0
  91. xoscar-0.9.0.dist-info/METADATA +230 -0
  92. xoscar-0.9.0.dist-info/RECORD +94 -0
  93. xoscar-0.9.0.dist-info/WHEEL +6 -0
  94. xoscar-0.9.0.dist-info/top_level.txt +2 -0
xoscar/collective/core.py
@@ -0,0 +1,737 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import hashlib
+ import os
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional
+
+ from .. import Actor, actor_ref
+ from ..context import get_context
+ from ..utils import lazy_import
+ from .common import (
+     COLLECTIVE_DEVICE_ID_ENV_KEY,
+     INVOKE_ERROR_MESSAGE,
+     RANK_ADDRESS_ENV_KEY,
+     AllReduceAlgorithm,
+     CollectiveReduceOp,
+ )
+ from .process_group import ProcessGroup, ProcessGroupGloo, ProcessGroupNCCL
+ from .utils import get_rank_address_via_env
+
+ cupy = lazy_import("cupy")
+
+
+ class RankActor(Actor):
+     def __init__(
+         self,
+         rank: int,
+         world: int,
+         backend: str = "gloo",
+         device_id: Optional[int] = None,
+         pg_options: Optional[ProcessGroup.Options] = None,
+     ):
+         assert backend == "gloo" or (
+             backend == "nccl" and cupy is not None
+         ), "cupy is required when using nccl as backend."
+         self._rank = rank
+         self._device_id = device_id
+         self._world = world
+         self._backend = backend
+         self.name_to_pg: Dict[str, Dict[str, "ProcessGroup"]] = defaultdict(dict)
+         self._pg_options = pg_options
+
+     @classmethod
+     def default_uid(cls):
+         return "RankActor"
+
+     async def __post_create__(self):
+         os.environ[RANK_ADDRESS_ENV_KEY] = self.address
+         _ip = self._get_ip()
+         if self._backend == "gloo":
+             pg = ProcessGroupGloo(
+                 _ip,
+                 self._rank,
+                 self._world,
+                 group_name="default",
+                 pg_options=self._pg_options,
+             )
+             self.name_to_pg["gloo"]["default"] = pg
+         elif self._backend == "nccl":
+             pg = ProcessGroupNCCL(
+                 _ip,
+                 self._rank,
+                 self._device_id,
+                 self._world,
+                 pg_options=self._pg_options,
+             )
+             self.name_to_pg["nccl"]["default"] = pg
+         else:
+             raise NotImplementedError("Not impl other backends for now!")
+
+     def process_group(self, pg_name: str) -> ProcessGroup:
+         return self.name_to_pg[self._backend][pg_name]
+
+     def rank(self) -> int:
+         return self._rank
+
+     def world(self) -> int:
+         return self._world
+
+     def device_id(self):
+         return self._device_id
+
+     def backend(self) -> str:
+         return self._backend
+
+     def _get_ip(self) -> str:
+         return self.address.rsplit(":", 1)[0]
+
+     def _process_group_name(self, ranks: List[int]) -> str:
+         return hashlib.sha1(
+             bytes(self._backend + "_".join(map(str, ranks)), "utf-8")
+         ).hexdigest()
+
+     def new_group(
+         self,
+         ranks: List[int],
+         pg_options: Optional[ProcessGroup.Options] = None,
+     ) -> Optional[str]:
+         assert (
+             len(ranks) <= self._world
+         ), "``ranks`` in new_group cannot be larger than the world."
+         assert all(
+             [self._world > rank >= 0 for rank in ranks]
+         ), "rank in ``ranks`` is illegal."
+         assert len({rank for rank in ranks}) == len(
+             ranks
+         ), "there can be no duplicate ranks in the ``ranks``."
+         if self._rank not in ranks:
+             return None
+         if len(ranks) == self._world:
+             return "default"
+         global_ranks = sorted(ranks)
+         group_rank = global_ranks.index(self._rank)
+         group_world = len(global_ranks)
+         group_name = self._process_group_name(global_ranks)
+         device_id = self._device_id
+         if group_name in self.name_to_pg[self._backend]:
+             return group_name
+         _ip = self._get_ip()
+         if self._backend == "gloo":
+             pg_gloo = ProcessGroupGloo(
+                 _ip,
+                 group_rank,
+                 group_world,
+                 group_name=group_name,
+                 pg_options=pg_options,
+             )
+             self.name_to_pg[self._backend][group_name] = pg_gloo
+         elif self._backend == "nccl":
+             pg_nccl = ProcessGroupNCCL(
+                 _ip,
+                 group_rank,
+                 device_id,  # type: ignore
+                 group_world,
+                 group_name=group_name,
+                 pg_options=pg_options,
+             )
+             self.name_to_pg[self._backend][group_name] = pg_nccl
+         else:
+             raise NotImplementedError("Not impl other backends for now!")
+         return group_name
+
+     def reduce(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+         root: Optional[int] = 0,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].reduce(
+                 send_data, recv_data, op=op, root=root, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].reduce(
+                 send_data,
+                 recv_data,
+                 op=op,
+                 root=root,
+                 stream=stream,
+             )
+
+     def allreduce(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+         algorithm: AllReduceAlgorithm = AllReduceAlgorithm.RING,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].allreduce(
+                 send_data, recv_data, op=op, algorithm=algorithm, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].allreduce(
+                 send_data, recv_data, op=op, stream=stream
+             )
+
+     def gather(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         root: Optional[int] = 0,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].gather(
+                 send_data, recv_data, root=root, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].gather(
+                 send_data, recv_data, root=root, stream=stream
+             )
+
+     def allgather(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].allgather(
+                 send_data, recv_data, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].allgather(
+                 send_data, recv_data, stream=stream
+             )
+
+     def scatter(
+         self,
+         send_data: List[Any],
+         recv_data: Any,
+         root: Optional[int] = 0,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].scatter(
+                 send_data, recv_data, root=root, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].scatter(
+                 send_data, recv_data, root=root, stream=stream
+             )
+
+     def reduce_scatter(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         recv_elems: List[int],
+         op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].reduce_scatter(
+                 send_data, recv_data, recv_elems, op
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].reduce_scatter(
+                 send_data, recv_data, recv_elems, op, stream=stream
+             )
+
+     def alltoall(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].alltoall(
+                 send_data, recv_data, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].alltoall(
+                 send_data, recv_data, stream=stream
+             )
+
+     def broadcast(
+         self,
+         send_data: Any,
+         recv_data: Any,
+         root: Optional[int] = 0,
+         tag: Optional[int] = 0,
+         pg_name: str = "default",
+         stream: Optional[Any] = None,
+     ):
+         assert self.backend() == "nccl" or (
+             self.backend() == "gloo" and stream is None
+         ), "The parameter 'stream' can only be used when the backend of the group is 'nccl'"
+
+         if self._backend == "gloo":
+             self.name_to_pg[self._backend][pg_name].broadcast(
+                 send_data, recv_data, root, tag=tag
+             )
+         else:
+             self.name_to_pg[self._backend][pg_name].broadcast(
+                 send_data, recv_data, root, stream=stream
+             )
+
+
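Note: two details of RankActor are worth calling out. ``__post_create__`` exports the actor's address through ``RANK_ADDRESS_ENV_KEY``, which is how the module-level helpers below find it later, and ``new_group`` names every sub-group by hashing the backend plus the sorted rank list, so each member derives the same key for ``name_to_pg`` without extra coordination. A standalone sketch of that naming scheme (illustrative only, not shipped in the wheel):

    import hashlib

    def process_group_name(backend: str, ranks: list) -> str:
        # Mirrors RankActor._process_group_name: the digest depends only on
        # the backend string and the sorted rank list.
        return hashlib.sha1(
            bytes(backend + "_".join(map(str, sorted(ranks))), "utf-8")
        ).hexdigest()

    # Ranks 1 and 3 both derive the same name for the sub-group {1, 3}.
    assert process_group_name("gloo", [3, 1]) == process_group_name("gloo", [1, 3])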
+ async def init_process_group(
+     rank: int,
+     world_size: int,
+     backend: str = "gloo",
+     device_id: Optional[int] = None,
+     address: Optional[str] = None,
+ ):
+     """
+     Initializes the default distributed process group, and this will also
+     initialize the distributed package.
+
+     Args:
+         rank (int): Rank of the current process (it should be a
+             number between 0 and ``world_size``-1).
+
+         world_size (int): Number of processes participating in
+             the job.
+
+         backend (str, optional): The backend to use. Depending on
+             build-time configurations, valid values include ``gloo`` and
+             ``nccl``. If the backend is not provided, then a ``gloo`` backend
+             will be created.
+
+         device_id (int, optional): GPU ID the actor will bind to. Default ``None``.
+             If it is None and the backend is nccl, it will be read from the
+             environment variable COLLECTIVE_DEVICE_ID_ENV_KEY.
+             If that environment variable is not set either, an error is raised.
+
+         address (str, optional): actor address. Default ``None``.
+     """
+     env_device_id = os.environ.get(COLLECTIVE_DEVICE_ID_ENV_KEY, None)
+     assert backend == "gloo" or (
+         backend == "nccl"
+         and (
+             device_id is not None
+             and device_id >= 0
+             or env_device_id is not None
+             and int(env_device_id) >= 0
+         )
+     ), "The device id should be set when using nccl as backend."
+     assert backend == "gloo" or (
+         backend == "nccl" and cupy is not None
+     ), "cupy is required when using nccl as backend."
+     address = address or os.environ.get(RANK_ADDRESS_ENV_KEY, None)
+     if address is None:
+         raise RuntimeError(
+             "Cannot decide which process to involve in the collective communication."
+         )
+     ctx = get_context()
+     if backend == "nccl" and device_id is None and env_device_id is not None:
+         device_id = int(env_device_id)
+     await ctx.create_actor(
+         RankActor,
+         rank,
+         world_size,
+         backend=backend,
+         device_id=device_id,
+         address=address,
+         uid="RankActor",
+     )
+
+
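Note: as a rough usage sketch (not part of the wheel), each participating process calls ``init_process_group`` once, pointing it at an actor pool address; the RankActor it creates then serves all later collective calls from that process. The helper name and address below are made up, and the sketch assumes a pool is already listening there.

    from xoscar.collective.core import init_process_group

    async def setup_rank(rank: int, world_size: int, pool_address: str) -> None:
        # Creates the RankActor (uid "RankActor") at ``pool_address`` and
        # builds the "default" gloo process group for this rank.
        await init_process_group(rank, world_size, backend="gloo", address=pool_address)

    # e.g. on the process hosting rank 0 (hypothetical address):
    # asyncio.run(setup_rank(0, world_size=2, pool_address="127.0.0.1:11111"))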
+ async def new_group(
+     ranks: List[int],
+     pg_options: Optional[ProcessGroup.Options] = None,
+ ):
+     """
+     Creates a new distributed group.
+
+     This function requires that all processes in the main group (i.e. all
+     processes that are part of the distributed job) enter this function, even
+     if they are not going to be members of the group. Additionally, groups
+     should be created in the same order in all processes.
+
+     Args:
+         ranks (list[int]): List of ranks of group members. Ranks not included
+             in ``ranks`` receive ``None``; passing every rank returns the
+             default group.
+
+         pg_options (ProcessGroup.Options, optional): process group options
+             specifying what additional options need to be passed in during
+             the construction of specific process groups.
+
+     Returns:
+         A handle of distributed group that can be given to collective calls.
+     """
+     address = os.environ.get(RANK_ADDRESS_ENV_KEY, None)
+     if address is None:
+         raise RuntimeError(INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     return await ref.new_group(ranks, pg_options)
+
+
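Note: a hedged sketch of how a sub-group might be used from within a rank that has already run ``init_process_group`` (the helper name and buffer arguments are placeholders):

    from xoscar.collective.core import allreduce, new_group

    async def reduce_on_subgroup(send_buf, recv_buf):
        # Every rank calls new_group with the same rank list; members get the
        # hashed group name back, non-members get None, and passing all ranks
        # simply returns the existing "default" group.
        group = await new_group([0, 1])
        if group is not None:
            await allreduce(send_buf, recv_buf, group_name=group)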
+ async def reduce(
+     send_data: Any,
+     recv_data: Any,
+     op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+     root: Optional[int] = 0,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Reduces the numpy or cupy data across all machines.
+
+     Only the process with rank ``root`` is going to receive the final result.
+
+     Args:
+         send_data (Any): Input of the collective. The function
+             operates in-place.
+
+         recv_data (Any): Output of the collective. The function
+             operates in-place.
+
+         root (int): Destination rank.
+
+         op (xoscar.collective.common.CollectiveReduceOp): One of the values from
+             ``xoscar.collective.common.CollectiveReduceOp``
+             enum. Specifies an operation used for element-wise reductions.
+             Default is ``xoscar.collective.common.CollectiveReduceOp.SUM``.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.reduce(
+         send_data,
+         recv_data,
+         op=op,
+         root=root,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def allreduce(
+     send_data: Any,
+     recv_data: Any,
+     op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+     algorithm: AllReduceAlgorithm = AllReduceAlgorithm.RING,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Reduces the numpy or cupy data across all machines in such a way that all get
+     the final result.
+
+     Args:
+         send_data (Any): Input of the collective. The function
+             operates in-place.
+
+         recv_data (Any): Output of the collective. The function
+             operates in-place.
+
+         op (xoscar.collective.common.CollectiveReduceOp): One of the values from
+             ``xoscar.collective.common.CollectiveReduceOp``
+             enum. Specifies an operation used for element-wise reductions.
+             Default is ``xoscar.collective.common.CollectiveReduceOp.SUM``.
+
+         algorithm (xoscar.collective.common.AllReduceAlgorithm): One of the values from
+             ``xoscar.collective.common.AllReduceAlgorithm``
+             enum. Specifies an algorithm used for element-wise reductions.
+             Default is ``xoscar.collective.common.AllReduceAlgorithm.RING``.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.allreduce(
+         send_data,
+         recv_data,
+         op=op,
+         algorithm=algorithm,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
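Note: a minimal numpy allreduce might look like the following, assuming ``init_process_group`` has already run in the current process so that ``RANK_ADDRESS_ENV_KEY`` is set (sketch only; the helper name is made up):

    import numpy as np

    from xoscar.collective.common import CollectiveReduceOp
    from xoscar.collective.core import allreduce

    async def sum_across_ranks(local: np.ndarray) -> np.ndarray:
        # The recv buffer must match the send buffer's shape and dtype; the
        # element-wise sum over all ranks is written into it in place.
        out = np.zeros_like(local)
        await allreduce(local, out, op=CollectiveReduceOp.SUM)
        return out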
+ async def gather(
+     send_data: Any,
+     recv_data: Any,
+     root: Optional[int] = 0,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Gathers a list of numpy or cupy data in a single process.
+
+     Args:
+         send_data (Any): Input data.
+
+         recv_data (Any): Output data.
+
+         root (int, optional): Destination rank. Default is 0.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.gather(
+         send_data,
+         recv_data,
+         root=root,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def allgather(
+     send_data: Any,
+     recv_data: Any,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Gathers a list of numpy or cupy data to all devices.
+
+     Args:
+         send_data (Any): Input data.
+
+         recv_data (Any): Output data.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.allgather(
+         send_data,
+         recv_data,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def scatter(
+     send_data: List[Any],
+     recv_data: Any,
+     root: Optional[int] = 0,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Scatters a list of numpy or cupy data to all processes in a group.
+
+     Each process will receive exactly one tensor and store its data in the
+     recv_data.
+
+     Args:
+         send_data (List[Any]): Input data.
+
+         recv_data (Any): Output data.
+
+         root (int, optional): Source rank (default is 0).
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.scatter(
+         send_data,
+         recv_data,
+         root=root,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def reduce_scatter(
+     send_data: Any,
+     recv_data: Any,
+     recv_elems: List[int],
+     op: CollectiveReduceOp = CollectiveReduceOp.SUM,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Reduces, then scatters a list of numpy or cupy data to all processes in a group.
+
+     Args:
+         send_data (Any): Input data.
+
+         recv_data (Any): Output data.
+
+         recv_elems (List[int]): the size of recv data for each process.
+
+         op (xoscar.collective.common.CollectiveReduceOp): One of the values from
+             ``xoscar.collective.common.CollectiveReduceOp``
+             enum. Specifies an operation used for element-wise reductions.
+             Default is ``xoscar.collective.common.CollectiveReduceOp.SUM``.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.reduce_scatter(
+         send_data,
+         recv_data,
+         recv_elems,
+         op,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def alltoall(
+     send_data: Any,
+     recv_data: Any,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Each process scatters a list of numpy or cupy data to all processes in a group.
+
+     Complex tensors are supported.
+
+     Args:
+         send_data (Any): Input data.
+
+         recv_data (Any): Output data.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.alltoall(
+         send_data,
+         recv_data,
+         tag=tag,
+         pg_name=group_name,
+         stream=stream,
+     )
+
+
+ async def broadcast(
+     send_data: Any,
+     recv_data: Any,
+     root: Optional[int] = 0,
+     tag: Optional[int] = 0,
+     group_name: str = "default",
+     stream: Optional[Any] = None,
+ ):
+     """
+     Broadcasts the tensor to the whole group.
+
+     Data must have the same number of elements in all processes
+     participating in the collective.
+
+     Args:
+         send_data (Any): Input data.
+
+         recv_data (Any): Output data.
+
+         root (int, optional): Source rank. Default is 0.
+
+         tag (int, optional): Tag for this operation. Default is 0.
+
+         group_name (str): The process group to work on. If None,
+             the default process group will be used.
+
+         stream (cupy.cuda.Stream, optional): stream handle for nccl, default is None.
+     """
+     address = get_rank_address_via_env(RANK_ADDRESS_ENV_KEY, INVOKE_ERROR_MESSAGE)
+     ref = await actor_ref(address=address, uid="RankActor")
+     await ref.broadcast(
+         send_data,
+         recv_data,
+         root,
+         tag,
+         pg_name=group_name,
+         stream=stream,
+     )
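Note: to illustrate ``recv_elems`` in ``reduce_scatter``, here is a hedged two-rank sketch (again assuming the default group is already initialized in this process; the helper name is made up). The summed send buffers are split so that rank 0 keeps the first two elements and rank 1 the last two.

    import numpy as np

    from xoscar.collective.core import reduce_scatter

    async def reduce_scatter_halves(rank: int) -> np.ndarray:
        # Each of the two ranks contributes a length-4 vector; after the
        # element-wise sum, recv_elems=[2, 2] assigns two elements per rank.
        send = np.arange(4, dtype=np.float64) + rank
        recv = np.zeros(2, dtype=np.float64)
        await reduce_scatter(send, recv, recv_elems=[2, 2])
        return recv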