torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.26__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +878 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +303 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +508 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +59 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +53 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +21 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/gradient/_gradient_generator.so +0 -0
  37. monarch/mesh_controller.py +263 -139
  38. monarch/monarch_controller +0 -0
  39. monarch/opaque_module.py +4 -6
  40. monarch/opaque_object.py +3 -3
  41. monarch/proc_mesh.py +6 -309
  42. monarch/python_local_mesh.py +1 -1
  43. monarch/rust_backend_mesh.py +2 -1
  44. monarch/rust_local_mesh.py +4 -2
  45. monarch/sim_mesh.py +10 -19
  46. monarch/simulator/command_history.py +1 -1
  47. monarch/simulator/interface.py +2 -1
  48. monarch/simulator/mock_controller.py +1 -1
  49. monarch/simulator/simulator.py +1 -1
  50. monarch/tensor_engine/__init__.py +23 -0
  51. monarch/tensor_worker_main.py +3 -1
  52. monarch/tools/cli.py +3 -1
  53. monarch/tools/commands.py +129 -47
  54. monarch/tools/components/hyperactor.py +5 -3
  55. monarch/tools/config/__init__.py +18 -1
  56. monarch/tools/config/defaults.py +2 -2
  57. monarch/tools/mesh_spec.py +59 -1
  58. monarch/tools/utils.py +38 -0
  59. monarch/worker/worker.py +1 -1
  60. monarch/world_mesh.py +2 -1
  61. monarch_supervisor/python_executable.py +6 -3
  62. tests/error_test_binary.py +48 -10
  63. tests/test_actor_error.py +370 -21
  64. tests/test_alloc.py +1 -1
  65. tests/test_allocator.py +369 -17
  66. tests/test_controller.py +2 -0
  67. tests/test_debugger.py +416 -0
  68. tests/test_env_before_cuda.py +161 -0
  69. tests/test_python_actors.py +184 -333
  70. tests/test_rdma.py +198 -0
  71. tests/test_remote_functions.py +40 -12
  72. tests/test_rust_backend.py +7 -5
  73. tests/test_sim_backend.py +1 -4
  74. tests/test_tensor_engine.py +81 -1
  75. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
  76. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
  77. torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
  78. monarch/_monarch/hyperactor/__init__.py +0 -58
  79. monarch/_monarch/worker/debugger.py +0 -117
  80. monarch/_monarch/worker/logging.py +0 -107
  81. monarch/debugger.py +0 -379
  82. monarch/future.py +0 -76
  83. monarch/rdma.py +0 -162
  84. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  85. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  86. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  87. /monarch/{common → _src/actor}/shape.py +0 -0
  88. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  89. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
  90. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
  91. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
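The file moves above, together with the new imports in the test diff below, indicate that the public actor API is consolidated under monarch.actor while implementation modules move under monarch._src.actor. A minimal sketch of the corresponding import migration for user code, using only names that appear in this diff (not checked against separate package documentation):

# Old imports (2025.7.1), as removed in the test_allocator.py diff below:
#   from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
#   from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
#   from monarch.proc_mesh import ProcMesh

# New imports (2025.7.26), as added in the test_allocator.py diff below:
from monarch.actor import (
    Actor,
    current_rank,
    current_size,
    endpoint,
    ProcMesh,
    ValueMesh,
)
from monarch._src.actor.allocator import (
    LocalAllocator,
    RemoteAllocator,
    StaticRemoteAllocInitializer,
)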
tests/test_allocator.py CHANGED
@@ -6,14 +6,17 @@
 
 # pyre-strict
 
+import asyncio
 import contextlib
 import importlib.resources
+import logging
 import math
 import os
 import subprocess
 import sys
 import unittest
 from datetime import timedelta
+from time import sleep
 from typing import Generator, Optional
 from unittest import mock
 
@@ -24,22 +27,28 @@ import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-from monarch._rust_bindings.hyperactor_extension.alloc import (
-    AllocConstraints,
-    AllocSpec,
-)
+from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
 from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelAddr,
     ChannelTransport,
 )
-from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
-from monarch.allocator import (
+
+from monarch._src.actor.allocator import (
     ALLOC_LABEL_PROC_MESH_NAME,
+    LocalAllocator,
     RemoteAllocator,
     StaticRemoteAllocInitializer,
     TorchXRemoteAllocInitializer,
 )
-from monarch.proc_mesh import ProcMesh
+from monarch._src.actor.sync_state import fake_sync_state
+from monarch.actor import (
+    Actor,
+    current_rank,
+    current_size,
+    endpoint,
+    ProcMesh,
+    ValueMesh,
+)
 from monarch.tools.mesh_spec import MeshSpec, ServerSpec
 from monarch.tools.network import get_sockaddr
 
@@ -49,6 +58,19 @@ from torchx.specs import AppState
 _100_MILLISECONDS = timedelta(milliseconds=100)
 
 SERVER_READY = "monarch.tools.commands.server_ready"
+UNUSED = "__UNUSED__"
+
+
+class EnvCheckActor(Actor):
+    """Actor that checks for the presence of an environment variable"""
+
+    def __init__(self) -> None:
+        pass
+
+    @endpoint
+    async def get_env_var(self, var_name: str) -> str:
+        """Return the value of the specified environment variable or 'NOT_SET' if not found"""
+        return os.environ.get(var_name, "NOT_SET")
 
 
 class TestActor(Actor):
@@ -57,6 +79,8 @@ class TestActor(Actor):
     def __init__(self) -> None:
         self.rank: int = current_rank().rank
         self.world_size: int = math.prod(current_size().values())
+        self.logger: logging.Logger = logging.getLogger("test_actor")
+        self.logger.setLevel(logging.INFO)
 
     @endpoint
     async def compute_world_size(self, master_addr: str, master_port: int) -> int:
@@ -71,17 +95,33 @@ class TestActor(Actor):
         finally:
             dist.destroy_process_group()
 
+    @endpoint
+    async def log(self, message: str) -> None:
+        print(f"Stdout LogMessage from print: {message}")
+        sys.stderr.write(f"Stderr LogMessage from print: {message}\n")
+        self.logger.info(f"LogMessage from logger: {message}")
+
 
 @contextlib.contextmanager
-def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None, None]:
-    with importlib.resources.path(__package__, "") as package_path:
+def remote_process_allocator(
+    addr: Optional[str] = None, timeout: Optional[int] = None
+) -> Generator[str, None, None]:
+    """Start a remote process allocator on addr. If timeout is not None, have it
+    timeout after that many seconds if no messages come in"""
+
+    with importlib.resources.as_file(
+        importlib.resources.files(__package__)
+    ) as package_path:
        addr = addr or ChannelAddr.any(ChannelTransport.Unix)
+        args = [
+            "process_allocator",
+            f"--addr={addr}",
+        ]
+        if timeout is not None:
+            args.append(f"--timeout-sec={timeout}")
 
         process_allocator = subprocess.Popen(
-            args=[
-                "process_allocator",
-                f"--addr={addr}",
-            ],
+            args=args,
             env={
                 # prefix PATH with this test module's directory to
                 # give 'process_allocator' and 'monarch_bootstrap' binary resources
@@ -102,6 +142,79 @@ def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None,
         process_allocator.kill()
 
 
+class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
+
+    async def test_setup_lambda_with_multiple_env_vars(self) -> None:
+        """Test that the setup lambda can set multiple environment variables"""
+        env_vars: dict[str, str] = {
+            "TEST_ENV_VAR_1": "value_1",
+            "TEST_ENV_VAR_2": "value_2",
+            "TEST_ENV_VAR_3": "value_3",
+        }
+
+        def setup_multiple_env_vars() -> None:
+            for name, value in env_vars.items():
+                os.environ[name] = value
+
+        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
+        allocator = LocalAllocator()
+        alloc = await allocator.allocate(spec)
+
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
+
+        try:
+            actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+
+            for name, expected_value in env_vars.items():
+                actual_value = await actor.get_env_var.call_one(name)
+                self.assertEqual(
+                    actual_value,
+                    expected_value,
+                    f"Environment variable {name} was not set correctly",
+                )
+        finally:
+            await proc_mesh.stop()
+
+    async def test_setup_lambda_with_context_info(self) -> None:
+        """Test that the setup lambda can access rank information"""
+        context_var_name: str = "PROC_MESH_RANK_INFO"
+
+        def setup_with_rank() -> None:
+            context_info = f"point_rank:{current_rank().rank}"
+            os.environ[context_var_name] = context_info
+
+        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
+        allocator = LocalAllocator()
+        alloc = await allocator.allocate(spec)
+
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)
+
+        try:
+            actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+
+            rank_info = await actor.get_env_var.call_one(context_var_name)
+
+            self.assertNotEqual(
+                rank_info,
+                "NOT_SET",
+                "Context information was not stored in the environment variable",
+            )
+            self.assertIn(
+                "point_rank:0",
+                rank_info,
+                f"Context information {rank_info} does not contain point_rank",
+            )
        finally:
+            await proc_mesh.stop()
+
+
 class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -153,7 +266,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
             """test initializer that returns an empty list of addresses"""
 
             async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
-                _ = match_labels  # Suppress unused variable warning
+                _ = match_labels
                 return []
 
         empty_initializer = EmptyAllocInitializer()
@@ -191,6 +304,209 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
 
         self.assert_computed_world_size(values, world_size)
 
+    async def test_stop_proc_mesh_blocking(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            # XXX - it is not clear why this trying to use
+            # async code in a sync context.
+            with fake_sync_state():
+                actor = proc_mesh.spawn("test_actor", TestActor).get()
+                proc_mesh.stop().get()
+                with self.assertRaises(
+                    RuntimeError, msg="`ProcMesh` has already been stopped"
+                ):
+                    proc_mesh.spawn("test_actor", TestActor).get()
+                del actor
+
+    async def test_wrong_address(self) -> None:
+        hosts = 1
+        gpus = 1
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator():
+            wrong_host = ChannelAddr.any(ChannelTransport.Unix)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(wrong_host),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+
+            with self.assertRaisesRegex(
+                Exception, r"no process has ever been allocated.*"
+            ):
+                await ProcMesh.from_alloc(alloc)
+
+    async def test_init_failure(self) -> None:
+        class FailInitActor(Actor):
+            def __init__(self) -> None:
+                if current_rank().rank == 0:
+                    raise RuntimeError("fail on init")
+
+            @endpoint
+            def dummy(self) -> None:
+                pass
+
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="helloworld",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
+            proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
+            actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
+
+            with self.assertRaisesRegex(
+                Exception,
+                r"(?s)fail on init",
+            ):
+                await actor_mesh.dummy.call()
+
+    async def test_stop_proc_mesh(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            actor = await proc_mesh.spawn("test_actor", TestActor)
+
+            await proc_mesh.stop()
+
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                await proc_mesh.spawn("test_actor", TestActor)
+
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we doing casting without accessing the wrapped type.
+            del actor
+
+    async def test_stop_proc_mesh_context_manager(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            with self.assertRaises(ValueError, msg="foo"):
+                async with proc_mesh:
+                    actor = await proc_mesh.spawn("test_actor", TestActor)
+                    # Ensure that proc mesh is stopped when context manager exits.
+                    raise ValueError("foo")
+
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                await proc_mesh.spawn("test_actor", TestActor)
+
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we doing casting without accessing the wrapped type.
+            del actor
+
+    async def test_setup_lambda_sets_env_vars(self) -> None:
+        """Test that the setup lambda can set environment variables during proc_mesh allocation"""
+        test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
+        test_var_value: str = "test_value_123"
+
+        def setup_env_vars() -> None:
+            os.environ[test_var_name] = test_var_value
+
+        hosts = 2
+        gpus = 4
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
+
+            try:
+                actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+
+                env_var_values = await actor.get_env_var.call(test_var_name)
+                env_var_value = env_var_values.item(host=0, gpu=0)
+
+                self.assertEqual(
+                    env_var_value,
+                    test_var_value,
+                    f"Environment variable {test_var_name} was not set correctly",
+                )
+            finally:
+                await proc_mesh.stop()
+
+    async def test_stop_proc_mesh_context_manager_multiple_times(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            # We can nest multiple context managers on the same mesh, the innermost
+            # one closes the mesh and it cannot be used after that.
+            async with proc_mesh:
+                async with proc_mesh:
+                    actor = await proc_mesh.spawn("test_actor", TestActor)
+
+                with self.assertRaises(
+                    RuntimeError, msg="`ProcMesh` has already been stopped"
+                ):
+                    await proc_mesh.spawn("test_actor", TestActor)
+                # Exiting a second time should not raise an error.
+
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we doing casting without accessing the wrapped type.
+            del actor
+
+    async def test_remote_allocator_with_no_connection(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=1, gpu=4)
+
+        with remote_process_allocator(timeout=1) as host1:
+            # Wait 3 seconds without making any processes, make sure it dies.
+            await asyncio.sleep(3)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            with self.assertRaisesRegex(
+                Exception, "no process has ever been allocated on"
+            ):
+                alloc = await allocator.allocate(spec)
+                await ProcMesh.from_alloc(alloc)
+
     async def test_stacked_1d_meshes(self) -> None:
         # create two stacked actor meshes on the same host
         # each actor mesh running on separate process-allocators
@@ -244,7 +560,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         # but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
 
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
         )
@@ -262,7 +579,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
     async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -295,7 +613,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
     async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -338,6 +657,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
 
         server = ServerSpec(
             name="test",
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -363,3 +683,35 @@
                 )
             )
             await ProcMesh.from_alloc(alloc)
+
+    async def test_log(self) -> None:
+        # create a mesh to log to both stdout and stderr
+
+        with remote_process_allocator() as host:
+            allocator = RemoteAllocator(
+                world_id="test_actor_logger",
+                initializer=StaticRemoteAllocInitializer(host),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+
+            spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
+
+            proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
+
+            # Generate aggregated log every 1 second.
+            await proc_mesh.logging_option(True, 1)
+            actor = await proc_mesh.spawn("actor", TestActor)
+            # Run for 4 seconds, every second generates 5 logs, so we expect to see
+            # 2 actors x 5 logs/actor/sec * 1 sec = 10 logs per aggregation.
+            for _ in range(20):
+                await actor.log.call("Expect to see [10 processes]")
+                sleep(0.2)
+            # Generate aggregated log every 2 seconds.
+            await proc_mesh.logging_option(True, 2)
+            # Run for 8 seconds, every second generates 5 logs, so we expect to see
+            # 2 actors x 5 logs/actor/sec * 2 sec = 20 logs per aggregation.
+            for _ in range(40):
+                await actor.log.call("Expect to see [20 processes]")
+                sleep(0.2)
+
+        print("======== All Done ========")
tests/test_controller.py CHANGED
@@ -653,6 +653,8 @@ def test_panicking_worker():
         _ = fetch_shard(torch.ones(2, 3)).result()
 
 
+# TODO - re-enable after resolving T232206970
+@pytest.mark.oss_skip
 def test_timeout_warning(caplog):
     timeout = 3
     with local_rust_device_mesh(