torchx-nightly 2024.9.8__py3-none-any.whl → 2024.9.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

torchx/cli/cmd_log.py CHANGED
@@ -23,6 +23,10 @@ from torchx.runner import get_runner, Runner
23
23
  from torchx.schedulers.api import Stream
24
24
  from torchx.specs.api import is_started
25
25
  from torchx.specs.builders import make_app_handle
26
+ from torchx.util.log_tee_helpers import (
27
+ _find_role_replicas as find_role_replicas,
28
+ _prefix_line,
29
+ )
26
30
 
27
31
  from torchx.util.types import none_throws
28
32
 
@@ -39,19 +43,6 @@ def validate(job_identifier: str) -> None:
39
43
  sys.exit(1)
40
44
 
41
45
 
42
- def _prefix_line(prefix: str, line: str) -> str:
43
- """
44
- _prefix_line ensure the prefix is still present even when dealing with return characters
45
- """
46
- if "\r" in line:
47
- line = line.replace("\r", f"\r{prefix}")
48
- if "\n" in line[:-1]:
49
- line = line[:-1].replace("\n", f"\n{prefix}") + line[-1:]
50
- if not line.startswith("\r"):
51
- line = f"{prefix}{line}"
52
- return line
53
-
54
-
55
46
  def print_log_lines(
56
47
  file: TextIO,
57
48
  runner: Runner,
@@ -167,17 +158,6 @@ def get_logs(
167
158
  raise threads_exceptions[0]
168
159
 
169
160
 
170
- def find_role_replicas(
171
- app: specs.AppDef, role_name: Optional[str]
172
- ) -> List[Tuple[str, int]]:
173
- role_replicas = []
174
- for role in app.roles:
175
- if role_name is None or role_name == role.name:
176
- for i in range(role.num_replicas):
177
- role_replicas.append((role.name, i))
178
- return role_replicas
179
-
180
-
181
161
  class CmdLog(SubCommand):
182
162
  def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
183
163
  subparser.add_argument(
torchx/cli/cmd_run.py CHANGED
@@ -21,7 +21,6 @@ from typing import Dict, List, Optional, Tuple
21
21
  import torchx.specs as specs
22
22
  from torchx.cli.argparse_util import ArgOnceAction, torchxconfig_run
23
23
  from torchx.cli.cmd_base import SubCommand
24
- from torchx.cli.cmd_log import get_logs
25
24
  from torchx.runner import config, get_runner, Runner
26
25
  from torchx.runner.config import load_sections
27
26
  from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
@@ -32,6 +31,7 @@ from torchx.specs.finder import (
32
31
  get_builtin_source,
33
32
  get_components,
34
33
  )
34
+ from torchx.util.log_tee_helpers import tee_logs
35
35
  from torchx.util.types import none_throws
36
36
 
37
37
 
@@ -288,16 +288,14 @@ class CmdRun(SubCommand):
288
288
  logger.debug(status)
289
289
 
290
290
  def _start_log_thread(self, runner: Runner, app_handle: str) -> threading.Thread:
291
- thread = threading.Thread(
292
- target=get_logs,
293
- kwargs={
294
- "file": sys.stderr,
295
- "runner": runner,
296
- "identifier": app_handle,
297
- "regex": None,
298
- "should_tail": True,
299
- },
291
+ thread = tee_logs(
292
+ dst=sys.stderr,
293
+ app_handle=app_handle,
294
+ regex=None,
295
+ runner=runner,
296
+ should_tail=True,
297
+ streams=None,
298
+ colorize=not sys.stderr.closed and sys.stderr.isatty(),
300
299
  )
301
- thread.daemon = True
302
300
  thread.start()
303
301
  return thread
torchx/runner/events/__init__.py CHANGED
@@ -27,6 +27,7 @@ from types import TracebackType
27
27
  from typing import Dict, Optional, Type
28
28
 
29
29
  from torchx.runner.events.handlers import get_logging_handler
30
+ from torchx.util.session import get_session_id_or_create_new
30
31
 
31
32
  from .api import SourceType, TorchxEvent # noqa F401
32
33
 
@@ -136,7 +137,7 @@ class log_event:
136
137
  workspace: Optional[str] = None,
137
138
  ) -> TorchxEvent:
138
139
  return TorchxEvent(
139
- session=app_id or "",
140
+ session=get_session_id_or_create_new(),
140
141
  scheduler=scheduler,
141
142
  api=api,
142
143
  app_id=app_id,
torchx/runner/events/api.py CHANGED
@@ -25,7 +25,7 @@ class TorchxEvent:
25
25
  The class represents the event produced by ``torchx.runner`` api calls.
26
26
 
27
27
  Arguments:
28
- session: Session id that was used to execute request.
28
+ session: Session id of the current run
29
29
  scheduler: Scheduler that is used to execute request
30
30
  api: Api name
31
31
  app_id: Unique id that is set by the underlying scheduler
torchx/util/log_tee_helpers.py ADDED
@@ -0,0 +1,210 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ """
10
+ If you're wrapping the TorchX API with your own CLI, these functions can
11
+ help show the logs of the job within your CLI, just like
12
+ `torchx log`
13
+ """
14
+
15
+ import logging
16
+ import threading
17
+ from queue import Queue
18
+ from typing import List, Optional, TextIO, Tuple, TYPE_CHECKING
19
+
20
+ from torchx.util.types import none_throws
21
+
22
+ if TYPE_CHECKING:
23
+ from torchx.runner.api import Runner
24
+ from torchx.schedulers.api import Stream
25
+ from torchx.specs.api import AppDef
26
+
27
+ logger: logging.Logger = logging.getLogger(__name__)
28
+
29
+ # A torchX job can have stderr/stdout for many replicas, of many roles
30
+ # The scheduler API has functions that allow us to get,
31
+ # with unspecified detail, the log lines of a given replica of
32
+ # a given role.
33
+ #
34
+ # So, to neatly tee the results, we:
35
+ # 1) Determine every role ID / replica ID pair we want to monitor
36
+ # 2) Request the given stderr / stdout / combined streams from them (1 thread each)
37
+ # 3) Concatenate each of those streams to a given destination file
38
+
39
+
40
+ def _find_role_replicas(
41
+ app: "AppDef",
42
+ role_name: Optional[str],
43
+ ) -> List[Tuple[str, int]]:
44
+ """
45
+ Enumerate all (role, replica id) pairs in the given AppDef.
46
+ Replica IDs are 0-indexed, and range up to num_replicas,
47
+ for each role.
48
+ If role_name is provided, filters to only that name.
49
+ """
50
+ role_replicas = []
51
+ for role in app.roles:
52
+ if role_name is None or role_name == role.name:
53
+ for i in range(role.num_replicas):
54
+ role_replicas.append((role.name, i))
55
+ return role_replicas
56
+
57
+
58
+ def _prefix_line(prefix: str, line: str) -> str:
59
+ """
60
+ _prefix_line ensure the prefix is still present even when dealing with return characters
61
+ """
62
+ if "\r" in line:
63
+ line = line.replace("\r", f"\r{prefix}")
64
+ if "\n" in line[:-1]:
65
+ line = line[:-1].replace("\n", f"\n{prefix}") + line[-1:]
66
+ if not line.startswith("\r"):
67
+ line = f"{prefix}{line}"
68
+ return line
69
+
70
+
71
+ def _print_log_lines_for_role_replica(
72
+ dst: TextIO,
73
+ app_handle: str,
74
+ regex: Optional[str],
75
+ runner: "Runner",
76
+ which_role: str,
77
+ which_replica: int,
78
+ exceptions: "Queue[Exception]",
79
+ should_tail: bool,
80
+ streams: Optional["Stream"],
81
+ colorize: bool = False,
82
+ ) -> None:
83
+ """
84
+ Helper function that'll run in parallel - one
85
+ per monitored replica of a given role.
86
+
87
+ Based on print_log_lines .. but not designed for TTY
88
+ """
89
+ try:
90
+ for line in runner.log_lines(
91
+ app_handle,
92
+ which_role,
93
+ which_replica,
94
+ regex,
95
+ should_tail=should_tail,
96
+ streams=streams,
97
+ ):
98
+ if colorize:
99
+ color_begin = "\033[32m"
100
+ color_end = "\033[0m"
101
+ else:
102
+ color_begin = ""
103
+ color_end = ""
104
+ prefix = f"{color_begin}{which_role}/{which_replica}{color_end} "
105
+ print(_prefix_line(prefix, line), file=dst, end="", flush=True)
106
+ except Exception as e:
107
+ exceptions.put(e)
108
+ raise
109
+
110
+
111
def _start_threads_to_monitor_role_replicas(
    dst: TextIO,
    app_handle: str,
    regex: Optional[str],
    runner: "Runner",
    which_role: Optional[str] = None,
    should_tail: bool = False,
    streams: Optional["Stream"] = None,
    colorize: bool = False,
) -> None:
    """
    Start one daemon thread per (role, replica) pair of the app and wait
    for all of them to finish.

    The app is looked up with ``runner.describe(app_handle)``; each thread
    runs ``_print_log_lines_for_role_replica`` for one pair, writing to
    ``dst``. ``which_role`` restricts the pairs to a single role name.

    Raises:
        ValueError: if no (role, replica) pair matches ``which_role``.
        Exception: the first exception recorded by a worker thread; any
        further recorded exceptions are logged at error level.
    """
    app = none_throws(runner.describe(app_handle))
    replica_ids = _find_role_replicas(app, role_name=which_role)

    # Thread-safe container the workers report their failures into.
    exceptions = Queue()

    if not replica_ids:
        valid_roles = [role.name for role in app.roles]
        raise ValueError(
            f"{which_role} is not a valid role name. Available: {valid_roles}"
        )

    # Arguments that are the same for every worker.
    shared_kwargs = {
        "dst": dst,
        "runner": runner,
        "app_handle": app_handle,
        "regex": regex,
        "should_tail": should_tail,
        "exceptions": exceptions,
        "streams": streams,
        "colorize": colorize,
    }
    workers = [
        threading.Thread(
            target=_print_log_lines_for_role_replica,
            kwargs={
                **shared_kwargs,
                "which_role": role_name,
                "which_replica": replica_id,
            },
            daemon=True,
        )
        for role_name, replica_id in replica_ids
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

    # All workers are done: drain whatever they reported.
    collected = []
    while not exceptions.empty():
        collected.append(exceptions.get())

    if collected:
        first, *rest = collected
        # Only one exception can be raised; log the others so they are not lost.
        for extra in rest:
            logger.error(extra)

        raise first
172
+
173
+
174
def tee_logs(
    dst: TextIO,
    app_handle: str,
    regex: Optional[str],
    runner: "Runner",
    should_tail: bool = False,
    streams: Optional["Stream"] = None,
    colorize: bool = False,
) -> threading.Thread:
    """
    Makes a thread, which in turn will start 1 thread per replica
    per role, that tees that role-replica's logs to the given
    destination buffer.

    The returned thread is a daemon thread and has NOT been started:
    you'll need to start and join with this parent thread.

    dst: TextIO to tee the logs into
    app_handle: The return value of runner.run() or runner.schedule()
    regex: Regex to filter the logs that are tee-d
    runner: The Runner you used to schedule the job
    should_tail: If true, continue until we run out of logs. Otherwise, just fetch
        what's available
    streams: Whether to fetch STDERR, STDOUT, or the temporally COMBINED (default) logs
    colorize: If true, the role/replica prefix of each line is wrapped in
        ANSI color escape codes
    """
    thread = threading.Thread(
        target=_start_threads_to_monitor_role_replicas,
        kwargs={
            "dst": dst,
            "runner": runner,
            "app_handle": app_handle,
            # Forward the caller's choices; these were previously hard-coded
            # (regex=None, should_tail=True) and ``streams`` was dropped, so
            # the documented parameters had no effect.
            "regex": regex,
            "should_tail": should_tail,
            "streams": streams,
            "colorize": colorize,
        },
        daemon=True,
    )
    return thread
torchx/util/session.py ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import uuid
11
+ from typing import Optional
12
+
13
# Session id cached for the lifetime of this module; None until first requested.
CURRENT_SESSION_ID: Optional[str] = None


def get_session_id_or_create_new() -> str:
    """
    Return the session ID cached in ``CURRENT_SESSION_ID``.

    When no (non-empty) ID has been stored yet, a fresh random UUID4 string
    is generated, stored in the module global and returned, so later calls
    keep returning that same value.
    """
    global CURRENT_SESSION_ID
    # Reuse the stored id when it is set and non-empty, otherwise mint one.
    session_id = CURRENT_SESSION_ID or str(uuid.uuid4())
    CURRENT_SESSION_ID = session_id
    return session_id
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: torchx-nightly
3
- Version: 2024.9.8
3
+ Version: 2024.9.13
4
4
  Summary: TorchX SDK and Components
5
5
  Home-page: https://github.com/pytorch/torchx
6
6
  Author: TorchX Devs
@@ -62,6 +62,7 @@ Requires-Dist: torchtext==0.17.1; extra == "dev"
62
62
  Requires-Dist: torchvision==0.17.1; extra == "dev"
63
63
  Requires-Dist: ts==0.5.1; extra == "dev"
64
64
  Requires-Dist: ray[default]; extra == "dev"
65
+ Requires-Dist: wheel; extra == "dev"
65
66
  Requires-Dist: lintrunner; extra == "dev"
66
67
  Requires-Dist: lintrunner-adapters; extra == "dev"
67
68
  Requires-Dist: grpcio==1.62.1; extra == "dev"
@@ -15,8 +15,8 @@ torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
15
15
  torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
16
16
  torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
17
17
  torchx/cli/cmd_list.py,sha256=BVqHEW2oTEJ3GqcFK7c1K-i2R-DUjaXQ-WBr0meeIGM,1429
18
- torchx/cli/cmd_log.py,sha256=Xh5vrsbwyV_ppwurrENGBNKxc1XLVbFC6YH1b8jlHAM,6104
19
- torchx/cli/cmd_run.py,sha256=T2TmkVZNbIljCEmVQHAQjzVIkHkOH_BV5dlY_4ErrUs,11220
18
+ torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
19
+ torchx/cli/cmd_run.py,sha256=OYAp0Rp_YxdS8FLrRxoWnYX24VIsNXaGiUHAyisEXYI,11185
20
20
  torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,1302
21
21
  torchx/cli/cmd_status.py,sha256=ubtmCp4PylrIh_kC3ZJ5QJm7lzXRt_aRPmY7j-sZu_0,1836
22
22
  torchx/cli/cmd_tracker.py,sha256=RfLxE4Cq1wfk7k051RtZ8RPJp0pEKSCa3KmTeRs3LF8,5218
@@ -58,8 +58,8 @@ torchx/pipelines/kfp/version.py,sha256=mYBxd6bm4MeR34D--xo-JLQ9wHeAl_ZQLwbItCf9t
58
58
  torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
59
59
  torchx/runner/api.py,sha256=17ZFEYeuIK0kUrTUysdgsFaEVc6_-ggvBuSoabQnrcA,28625
60
60
  torchx/runner/config.py,sha256=fTdCcf-MKlBg6MzXopF4W0hYyDDoPAuvZs2v2bKzwG0,17849
61
- torchx/runner/events/__init__.py,sha256=KxvodAlq9ZDUIKzdlzHuyQWa0aoIGKTtBfE7vjM6GRk,4570
62
- torchx/runner/events/api.py,sha256=UTdExV7cMR5kC0nQIViwTlyblhBF3D5bbuqd4-kWrkE,2542
61
+ torchx/runner/events/__init__.py,sha256=TtzBLZ9oaHKfr689R4NnjCii3G8kxiRafe7Q0jRE5_k,4649
62
+ torchx/runner/events/api.py,sha256=_6mjS5B5FcSMEea1U0M-fD4jErmbpZ1gQOyUHV-4dqY,2527
63
63
  torchx/runner/events/handlers.py,sha256=ThHCIJW21BfBgB7b6ftyjASJmD1KdizpjuTtsyqnvJs,522
64
64
  torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,593
65
65
  torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
@@ -105,7 +105,9 @@ torchx/util/cuda.py,sha256=-ZTa1WCLnY2WtSWAdWufLQqZSDCZfZsloBuiS84LIkU,1099
105
105
  torchx/util/datetime.py,sha256=hV6Sg0u5KTBe68yrmy_RGCC5su0i4Tb_mAYphWamiXI,405
106
106
  torchx/util/entrypoints.py,sha256=4rqmA81XYLj4Kk7GboJi0z78h4NIQxSrcOzDuuTwCkw,2725
107
107
  torchx/util/io.py,sha256=HNpWLcFUX0WTAP3CsdamHz--FR5A4kSdLCPfNqa2UkA,1807
108
+ torchx/util/log_tee_helpers.py,sha256=yJ4ODU3QE0t7ZgYHrE-HiwlOhfg8BK1e4oVaw_zMa2E,6320
108
109
  torchx/util/modules.py,sha256=LRTuZRH5bbRr0ZaCtCtvKbgwhMoPsTx-GokWbCLGPdk,1131
110
+ torchx/util/session.py,sha256=__4DFoJqpTzqc99FPlKyLPDQFn2g3rtu0dFb5b-Z-7E,721
109
111
  torchx/util/shlex.py,sha256=eXEKu8KC3zIcd8tEy9_s8Ds5oma8BORr-0VGWNpG2dk,463
110
112
  torchx/util/strings.py,sha256=GkLWCmYS89Uv6bWc5hH0XwvHy7oQmprv2U7axC4A2e8,678
111
113
  torchx/util/types.py,sha256=een55pV-N8aBc3qUBjHRc1llJcX10JVa19pB8dBE8No,7564
@@ -113,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
113
115
  torchx/workspace/api.py,sha256=1heBmPgB-W5Zf9gwViM7NrqvHpZlVYeMN7jpY8Qkytc,5479
114
116
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
115
117
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
116
- torchx_nightly-2024.9.8.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
117
- torchx_nightly-2024.9.8.dist-info/METADATA,sha256=GDLO7VCHMTXs4ZwHgCRkSf6pWFBw3p08VMPhfqL7dEA,6131
118
- torchx_nightly-2024.9.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
119
- torchx_nightly-2024.9.8.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
120
- torchx_nightly-2024.9.8.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
121
- torchx_nightly-2024.9.8.dist-info/RECORD,,
118
+ torchx_nightly-2024.9.13.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
119
+ torchx_nightly-2024.9.13.dist-info/METADATA,sha256=8rzgUn_Ac9ga2suLgjBnQwrS86iCpGmcao1CC9Oixjk,6169
120
+ torchx_nightly-2024.9.13.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
121
+ torchx_nightly-2024.9.13.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
122
+ torchx_nightly-2024.9.13.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
123
+ torchx_nightly-2024.9.13.dist-info/RECORD,,