torchmonarch-nightly 2025.7.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
tests/test_allocator.py CHANGED
@@ -6,14 +6,17 @@
6
6
 
7
7
  # pyre-strict
8
8
 
9
+ import asyncio
9
10
  import contextlib
10
11
  import importlib.resources
12
+ import logging
11
13
  import math
12
14
  import os
13
15
  import subprocess
14
16
  import sys
15
17
  import unittest
16
18
  from datetime import timedelta
19
+ from time import sleep
17
20
  from typing import Generator, Optional
18
21
  from unittest import mock
19
22
 
@@ -24,22 +27,29 @@ import torch
24
27
  import torch.distributed as dist
25
28
  import torch.nn.functional as F
26
29
 
27
- from monarch._rust_bindings.hyperactor_extension.alloc import (
28
- AllocConstraints,
29
- AllocSpec,
30
- )
30
+ from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
31
31
  from monarch._rust_bindings.monarch_hyperactor.channel import (
32
32
  ChannelAddr,
33
33
  ChannelTransport,
34
34
  )
35
- from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
36
- from monarch.allocator import (
35
+
36
+ from monarch._src.actor.actor_mesh import MonarchContext
37
+ from monarch._src.actor.allocator import (
37
38
  ALLOC_LABEL_PROC_MESH_NAME,
39
+ LocalAllocator,
38
40
  RemoteAllocator,
39
41
  StaticRemoteAllocInitializer,
40
42
  TorchXRemoteAllocInitializer,
41
43
  )
42
- from monarch.proc_mesh import ProcMesh
44
+ from monarch._src.actor.sync_state import fake_sync_state
45
+ from monarch.actor import (
46
+ Actor,
47
+ current_rank,
48
+ current_size,
49
+ endpoint,
50
+ ProcMesh,
51
+ ValueMesh,
52
+ )
43
53
  from monarch.tools.mesh_spec import MeshSpec, ServerSpec
44
54
  from monarch.tools.network import get_sockaddr
45
55
 
@@ -49,6 +59,19 @@ from torchx.specs import AppState
49
59
  _100_MILLISECONDS = timedelta(milliseconds=100)
50
60
 
51
61
  SERVER_READY = "monarch.tools.commands.server_ready"
62
+ UNUSED = "__UNUSED__"
63
+
64
+
65
+ class EnvCheckActor(Actor):
66
+ """Actor that checks for the presence of an environment variable"""
67
+
68
+ def __init__(self) -> None:
69
+ pass
70
+
71
+ @endpoint
72
+ async def get_env_var(self, var_name: str) -> str:
73
+ """Return the value of the specified environment variable or 'NOT_SET' if not found"""
74
+ return os.environ.get(var_name, "NOT_SET")
52
75
 
53
76
 
54
77
  class TestActor(Actor):
@@ -57,6 +80,8 @@ class TestActor(Actor):
57
80
  def __init__(self) -> None:
58
81
  self.rank: int = current_rank().rank
59
82
  self.world_size: int = math.prod(current_size().values())
83
+ self.logger: logging.Logger = logging.getLogger("test_actor")
84
+ self.logger.setLevel(logging.INFO)
60
85
 
61
86
  @endpoint
62
87
  async def compute_world_size(self, master_addr: str, master_port: int) -> int:
@@ -71,17 +96,33 @@ class TestActor(Actor):
71
96
  finally:
72
97
  dist.destroy_process_group()
73
98
 
99
+ @endpoint
100
+ async def log(self, message: str) -> None:
101
+ print(f"Stdout LogMessage from print: {message}")
102
+ sys.stderr.write(f"Stderr LogMessage from print: {message}\n")
103
+ self.logger.info(f"LogMessage from logger: {message}")
104
+
74
105
 
75
106
  @contextlib.contextmanager
76
- def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None, None]:
77
- with importlib.resources.path(__package__, "") as package_path:
107
+ def remote_process_allocator(
108
+ addr: Optional[str] = None, timeout: Optional[int] = None
109
+ ) -> Generator[str, None, None]:
110
+ """Start a remote process allocator on addr. If timeout is not None, have it
111
+ time out after that many seconds if no messages come in"""
112
+
113
+ with importlib.resources.as_file(
114
+ importlib.resources.files(__package__)
115
+ ) as package_path:
78
116
  addr = addr or ChannelAddr.any(ChannelTransport.Unix)
117
+ args = [
118
+ "process_allocator",
119
+ f"--addr={addr}",
120
+ ]
121
+ if timeout is not None:
122
+ args.append(f"--timeout-sec={timeout}")
79
123
 
80
124
  process_allocator = subprocess.Popen(
81
- args=[
82
- "process_allocator",
83
- f"--addr={addr}",
84
- ],
125
+ args=args,
85
126
  env={
86
127
  # prefix PATH with this test module's directory to
87
128
  # give 'process_allocator' and 'monarch_bootstrap' binary resources
@@ -102,6 +143,82 @@ def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None,
102
143
  process_allocator.kill()
103
144
 
104
145
 
146
+ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
147
+ @classmethod
148
+ def setUpClass(cls) -> None:
149
+ cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
150
+
151
+ @classmethod
152
+ def tearDownClass(cls) -> None:
153
+ cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
154
+
155
+ async def test_setup_lambda_with_multiple_env_vars(self) -> None:
156
+ """Test that the setup lambda can set multiple environment variables"""
157
+ env_vars: dict[str, str] = {
158
+ "TEST_ENV_VAR_1": "value_1",
159
+ "TEST_ENV_VAR_2": "value_2",
160
+ "TEST_ENV_VAR_3": "value_3",
161
+ }
162
+
163
+ def setup_multiple_env_vars(ctx: MonarchContext) -> None:
164
+ for name, value in env_vars.items():
165
+ os.environ[name] = value
166
+
167
+ spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
168
+ allocator = LocalAllocator()
169
+ alloc = await allocator.allocate(spec)
170
+
171
+ proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
172
+
173
+ try:
174
+ actor = await proc_mesh.spawn("env_check", EnvCheckActor)
175
+
176
+ for name, expected_value in env_vars.items():
177
+ actual_value = await actor.get_env_var.call_one(name)
178
+ self.assertEqual(
179
+ actual_value,
180
+ expected_value,
181
+ f"Environment variable {name} was not set correctly",
182
+ )
183
+ finally:
184
+ await proc_mesh.stop()
185
+
186
+ async def test_setup_lambda_with_context_info(self) -> None:
187
+ """Test that the setup lambda can access context information"""
188
+ context_var_name: str = "PROC_MESH_CONTEXT_INFO"
189
+
190
+ def setup_with_context(ctx: MonarchContext) -> None:
191
+ context_info = f"proc_id:{ctx.proc_id},point_rank:{ctx.point.rank}"
192
+ os.environ[context_var_name] = context_info
193
+
194
+ spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
195
+ allocator = LocalAllocator()
196
+ alloc = await allocator.allocate(spec)
197
+
198
+ proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_context)
199
+
200
+ try:
201
+ actor = await proc_mesh.spawn("env_check", EnvCheckActor)
202
+
203
+ context_info = await actor.get_env_var.call_one(context_var_name)
204
+
205
+ self.assertNotEqual(
206
+ context_info,
207
+ "NOT_SET",
208
+ "Context information was not stored in the environment variable",
209
+ )
210
+ self.assertIn(
211
+ "proc_id:", context_info, "Context information does not contain proc_id"
212
+ )
213
+ self.assertIn(
214
+ "point_rank:0",
215
+ context_info,
216
+ f"Context information {context_info} does not contain point_rank",
217
+ )
218
+ finally:
219
+ await proc_mesh.stop()
220
+
221
+
105
222
  class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
106
223
  @classmethod
107
224
  def setUpClass(cls) -> None:
@@ -153,7 +270,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
153
270
  """test initializer that returns an empty list of addresses"""
154
271
 
155
272
  async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
156
- _ = match_labels # Suppress unused variable warning
273
+ _ = match_labels
157
274
  return []
158
275
 
159
276
  empty_initializer = EmptyAllocInitializer()
@@ -191,6 +308,209 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
191
308
 
192
309
  self.assert_computed_world_size(values, world_size)
193
310
 
311
+ async def test_stop_proc_mesh_blocking(self) -> None:
312
+ spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
313
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
314
+ allocator = RemoteAllocator(
315
+ world_id="test_remote_allocator",
316
+ initializer=StaticRemoteAllocInitializer(host1, host2),
317
+ heartbeat_interval=_100_MILLISECONDS,
318
+ )
319
+
320
+ alloc = await allocator.allocate(spec)
321
+ proc_mesh = await ProcMesh.from_alloc(alloc)
322
+ # XXX - it is not clear why this is trying to use
323
+ # async code in a sync context.
324
+ with fake_sync_state():
325
+ actor = proc_mesh.spawn("test_actor", TestActor).get()
326
+ proc_mesh.stop().get()
327
+ with self.assertRaises(
328
+ RuntimeError, msg="`ProcMesh` has already been stopped"
329
+ ):
330
+ proc_mesh.spawn("test_actor", TestActor).get()
331
+ del actor
332
+
333
+ async def test_wrong_address(self) -> None:
334
+ hosts = 1
335
+ gpus = 1
336
+ spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
337
+
338
+ # start one process-allocator, but hand the initializer a separately generated address
339
+ with remote_process_allocator():
340
+ wrong_host = ChannelAddr.any(ChannelTransport.Unix)
341
+ allocator = RemoteAllocator(
342
+ world_id="test_remote_allocator",
343
+ initializer=StaticRemoteAllocInitializer(wrong_host),
344
+ heartbeat_interval=_100_MILLISECONDS,
345
+ )
346
+ alloc = await allocator.allocate(spec)
347
+
348
+ with self.assertRaisesRegex(
349
+ Exception, r"no process has ever been allocated.*"
350
+ ):
351
+ await ProcMesh.from_alloc(alloc)
352
+
353
+ async def test_init_failure(self) -> None:
354
+ class FailInitActor(Actor):
355
+ def __init__(self) -> None:
356
+ if current_rank().rank == 0:
357
+ raise RuntimeError("fail on init")
358
+
359
+ @endpoint
360
+ def dummy(self) -> None:
361
+ pass
362
+
363
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
364
+ allocator = RemoteAllocator(
365
+ world_id="helloworld",
366
+ initializer=StaticRemoteAllocInitializer(host1, host2),
367
+ heartbeat_interval=_100_MILLISECONDS,
368
+ )
369
+ spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
370
+ proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
371
+ actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
372
+
373
+ with self.assertRaisesRegex(
374
+ Exception,
375
+ r"(?s)fail on init",
376
+ ):
377
+ await actor_mesh.dummy.call()
378
+
379
+ async def test_stop_proc_mesh(self) -> None:
380
+ spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
381
+
382
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
383
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
384
+ allocator = RemoteAllocator(
385
+ world_id="test_remote_allocator",
386
+ initializer=StaticRemoteAllocInitializer(host1, host2),
387
+ heartbeat_interval=_100_MILLISECONDS,
388
+ )
389
+ alloc = await allocator.allocate(spec)
390
+ proc_mesh = await ProcMesh.from_alloc(alloc)
391
+ actor = await proc_mesh.spawn("test_actor", TestActor)
392
+
393
+ await proc_mesh.stop()
394
+
395
+ with self.assertRaises(
396
+ RuntimeError, msg="`ProcMesh` has already been stopped"
397
+ ):
398
+ await proc_mesh.spawn("test_actor", TestActor)
399
+
400
+ # TODO(agallagher): It'd be nice to test that this just fails
401
+ # immediately, trying to access the wrapped actor mesh, but right
402
+ # now we are doing casting without accessing the wrapped type.
403
+ del actor
404
+
405
+ async def test_stop_proc_mesh_context_manager(self) -> None:
406
+ spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
407
+
408
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
409
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
410
+ allocator = RemoteAllocator(
411
+ world_id="test_remote_allocator",
412
+ initializer=StaticRemoteAllocInitializer(host1, host2),
413
+ heartbeat_interval=_100_MILLISECONDS,
414
+ )
415
+ alloc = await allocator.allocate(spec)
416
+ proc_mesh = await ProcMesh.from_alloc(alloc)
417
+ with self.assertRaises(ValueError, msg="foo"):
418
+ async with proc_mesh:
419
+ actor = await proc_mesh.spawn("test_actor", TestActor)
420
+ # Ensure that proc mesh is stopped when context manager exits.
421
+ raise ValueError("foo")
422
+
423
+ with self.assertRaises(
424
+ RuntimeError, msg="`ProcMesh` has already been stopped"
425
+ ):
426
+ await proc_mesh.spawn("test_actor", TestActor)
427
+
428
+ # TODO(agallagher): It'd be nice to test that this just fails
429
+ # immediately, trying to access the wrapped actor mesh, but right
430
+ # now we are doing casting without accessing the wrapped type.
431
+ del actor
432
+
433
+ async def test_setup_lambda_sets_env_vars(self) -> None:
434
+ """Test that the setup lambda can set environment variables during proc_mesh allocation"""
435
+ test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
436
+ test_var_value: str = "test_value_123"
437
+
438
+ def setup_env_vars(ctx: MonarchContext) -> None:
439
+ os.environ[test_var_name] = test_var_value
440
+
441
+ hosts = 2
442
+ gpus = 4
443
+ spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
444
+
445
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
446
+ allocator = RemoteAllocator(
447
+ world_id="test_remote_allocator",
448
+ initializer=StaticRemoteAllocInitializer(host1, host2),
449
+ heartbeat_interval=_100_MILLISECONDS,
450
+ )
451
+ alloc = await allocator.allocate(spec)
452
+ proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
453
+
454
+ try:
455
+ actor = await proc_mesh.spawn("env_check", EnvCheckActor)
456
+
457
+ env_var_values = await actor.get_env_var.call(test_var_name)
458
+ env_var_value = env_var_values.item(host=0, gpu=0)
459
+
460
+ self.assertEqual(
461
+ env_var_value,
462
+ test_var_value,
463
+ f"Environment variable {test_var_name} was not set correctly",
464
+ )
465
+ finally:
466
+ await proc_mesh.stop()
467
+
468
+ async def test_stop_proc_mesh_context_manager_multiple_times(self) -> None:
469
+ spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
470
+
471
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
472
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
473
+ allocator = RemoteAllocator(
474
+ world_id="test_remote_allocator",
475
+ initializer=StaticRemoteAllocInitializer(host1, host2),
476
+ heartbeat_interval=_100_MILLISECONDS,
477
+ )
478
+ alloc = await allocator.allocate(spec)
479
+ proc_mesh = await ProcMesh.from_alloc(alloc)
480
+ # We can nest multiple context managers on the same mesh, the innermost
481
+ # one closes the mesh and it cannot be used after that.
482
+ async with proc_mesh:
483
+ async with proc_mesh:
484
+ actor = await proc_mesh.spawn("test_actor", TestActor)
485
+
486
+ with self.assertRaises(
487
+ RuntimeError, msg="`ProcMesh` has already been stopped"
488
+ ):
489
+ await proc_mesh.spawn("test_actor", TestActor)
490
+ # Exiting a second time should not raise an error.
491
+
492
+ # TODO(agallagher): It'd be nice to test that this just fails
493
+ # immediately, trying to access the wrapped actor mesh, but right
494
+ # now we are doing casting without accessing the wrapped type.
495
+ del actor
496
+
497
+ async def test_remote_allocator_with_no_connection(self) -> None:
498
+ spec = AllocSpec(AllocConstraints(), host=1, gpu=4)
499
+
500
+ with remote_process_allocator(timeout=1) as host1:
501
+ # Wait 3 seconds without making any processes, make sure it dies.
502
+ await asyncio.sleep(3)
503
+ allocator = RemoteAllocator(
504
+ world_id="test_remote_allocator",
505
+ initializer=StaticRemoteAllocInitializer(host1),
506
+ heartbeat_interval=_100_MILLISECONDS,
507
+ )
508
+ with self.assertRaisesRegex(
509
+ Exception, "no process has ever been allocated on"
510
+ ):
511
+ alloc = await allocator.allocate(spec)
512
+ await ProcMesh.from_alloc(alloc)
513
+
194
514
  async def test_stacked_1d_meshes(self) -> None:
195
515
  # create two stacked actor meshes on the same host
196
516
  # each actor mesh running on separate process-allocators
@@ -244,7 +564,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
244
564
  # but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
245
565
 
246
566
  server = ServerSpec(
247
- name="__UNUSED__",
567
+ name=UNUSED,
568
+ scheduler=UNUSED,
248
569
  state=AppState.RUNNING,
249
570
  meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
250
571
  )
@@ -262,7 +583,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
262
583
  @pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
263
584
  async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
264
585
  server = ServerSpec(
265
- name="__UNUSED__",
586
+ name=UNUSED,
587
+ scheduler=UNUSED,
266
588
  state=AppState.RUNNING,
267
589
  meshes=[
268
590
  MeshSpec(
@@ -295,7 +617,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
295
617
  @pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
296
618
  async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
297
619
  server = ServerSpec(
298
- name="__UNUSED__",
620
+ name=UNUSED,
621
+ scheduler=UNUSED,
299
622
  state=AppState.RUNNING,
300
623
  meshes=[
301
624
  MeshSpec(
@@ -338,6 +661,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
338
661
 
339
662
  server = ServerSpec(
340
663
  name="test",
664
+ scheduler=UNUSED,
341
665
  state=AppState.RUNNING,
342
666
  meshes=[
343
667
  MeshSpec(
@@ -363,3 +687,35 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
363
687
  )
364
688
  )
365
689
  await ProcMesh.from_alloc(alloc)
690
+
691
+ async def test_log(self) -> None:
692
+ # create a mesh to log to both stdout and stderr
693
+
694
+ with remote_process_allocator() as host:
695
+ allocator = RemoteAllocator(
696
+ world_id="test_actor_logger",
697
+ initializer=StaticRemoteAllocInitializer(host),
698
+ heartbeat_interval=_100_MILLISECONDS,
699
+ )
700
+
701
+ spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
702
+
703
+ proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
704
+
705
+ # Generate aggregated log every 1 second.
706
+ await proc_mesh.logging_option(True, 1)
707
+ actor = await proc_mesh.spawn("actor", TestActor)
708
+ # Run for 4 seconds, every second generates 5 logs, so we expect to see
709
+ # 2 actors x 5 logs/actor/sec * 1 sec = 10 logs per aggregation.
710
+ for _ in range(20):
711
+ await actor.log.call("Expect to see [10 processes]")
712
+ sleep(0.2)
713
+ # Generate aggregated log every 2 seconds.
714
+ await proc_mesh.logging_option(True, 2)
715
+ # Run for 8 seconds, every second generates 5 logs, so we expect to see
716
+ # 2 actors x 5 logs/actor/sec * 2 sec = 20 logs per aggregation.
717
+ for _ in range(40):
718
+ await actor.log.call("Expect to see [20 processes]")
719
+ sleep(0.2)
720
+
721
+ print("======== All Done ========")
tests/test_controller.py CHANGED
@@ -653,6 +653,8 @@ def test_panicking_worker():
653
653
  _ = fetch_shard(torch.ones(2, 3)).result()
654
654
 
655
655
 
656
+ # TODO - re-enable after resolving T232206970
657
+ @pytest.mark.oss_skip
656
658
  def test_timeout_warning(caplog):
657
659
  timeout = 3
658
660
  with local_rust_device_mesh(