torchx-nightly 2024.2.12__py3-none-any.whl → 2025.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic.
- torchx/__init__.py +2 -0
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +2 -0
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +30 -12
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +2 -0
- torchx/cli/cmd_tracker.py +2 -0
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +2 -0
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +2 -0
- torchx/components/integration_tests/component_provider.py +2 -0
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +2 -0
- torchx/components/utils.py +2 -0
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +2 -0
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +2 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +2 -0
- torchx/examples/pipelines/kfp/dist_pipeline.py +3 -1
- torchx/examples/pipelines/kfp/intro_pipeline.py +3 -1
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/pipelines/kfp/__init__.py +2 -0
- torchx/pipelines/kfp/adapter.py +7 -4
- torchx/pipelines/kfp/version.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +78 -20
- torchx/runner/config.py +34 -3
- torchx/runner/events/__init__.py +37 -3
- torchx/runner/events/api.py +13 -2
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +10 -5
- torchx/schedulers/api.py +3 -1
- torchx/schedulers/aws_batch_scheduler.py +4 -0
- torchx/schedulers/aws_sagemaker_scheduler.py +596 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +38 -8
- torchx/schedulers/gcp_batch_scheduler.py +8 -9
- torchx/schedulers/ids.py +2 -0
- torchx/schedulers/kubernetes_mcad_scheduler.py +3 -1
- torchx/schedulers/kubernetes_scheduler.py +31 -5
- torchx/schedulers/local_scheduler.py +45 -6
- torchx/schedulers/lsf_scheduler.py +3 -1
- torchx/schedulers/ray/ray_driver.py +7 -7
- torchx/schedulers/ray_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +3 -1
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +49 -8
- torchx/specs/api.py +87 -5
- torchx/specs/builders.py +61 -19
- torchx/specs/file_linter.py +8 -2
- torchx/specs/finder.py +2 -0
- torchx/specs/named_resources_aws.py +109 -2
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +2 -0
- torchx/tracker/api.py +4 -4
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +6 -2
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +2 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +2 -0
- torchx/util/types.py +20 -2
- torchx/version.py +3 -1
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +34 -1
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +25 -2
- {torchx_nightly-2024.2.12.dist-info → torchx_nightly-2025.1.14.dist-info}/METADATA +55 -48
- torchx_nightly-2025.1.14.dist-info/RECORD +123 -0
- {torchx_nightly-2024.2.12.dist-info → torchx_nightly-2025.1.14.dist-info}/WHEEL +1 -1
- {torchx_nightly-2024.2.12.dist-info → torchx_nightly-2025.1.14.dist-info}/entry_points.txt +0 -1
- torchx_nightly-2024.2.12.dist-info/RECORD +0 -119
- {torchx_nightly-2024.2.12.dist-info → torchx_nightly-2025.1.14.dist-info}/LICENSE +0 -0
- {torchx_nightly-2024.2.12.dist-info → torchx_nightly-2025.1.14.dist-info}/top_level.txt +0 -0
torchx/util/entrypoints.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 from typing import Any, Dict, Optional
 
 import importlib_metadata as metadata
@@ -49,8 +51,7 @@ def _defer_load_ep(ep: EntryPoint) -> object:
 
 # pyre-ignore-all-errors[3, 2]
 def load_group(
-    group: str,
-    default: Optional[Dict[str, Any]] = None,
+    group: str, default: Optional[Dict[str, Any]] = None, skip_defaults: bool = False
 ):
     """
     Loads all the entry points specified by ``group`` and returns
@@ -70,6 +71,7 @@ def load_group(
     1. ``load_group("foo")["bar"]("baz")`` -> equivalent to calling ``this.is.a_fn("baz")``
     1. ``load_group("food")`` -> ``None``
     1. ``load_group("food", default={"hello": this.is.c_fn})["hello"]("world")`` -> equivalent to calling ``this.is.c_fn("world")``
+    1. ``load_group("food", default={"hello": this.is.c_fn}, skip_defaults=True)`` -> ``None``
 
 
     If the entrypoint is a module (versus a function as shown above), then calling the ``deferred_load_fn``
@@ -88,6 +90,8 @@ def load_group(
     entrypoints = metadata.entry_points().select(group=group)
 
     if len(entrypoints) == 0:
+        if skip_defaults:
+            return None
         return default
 
     eps = {}
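The practical effect of the new ``skip_defaults`` flag: callers can now tell "no entry points registered for the group" apart from "fell back to the supplied defaults". A minimal sketch of that distinction, assuming torchx is installed; the group name and the fallback mapping below are illustrative, not part of torchx:

from torchx.util.entrypoints import load_group

def _fallback() -> str:
    # hypothetical fallback component, used only for illustration
    return "fallback"

DEFAULTS = {"fallback": _fallback}

# Previous behavior: an unregistered group silently resolves to the defaults.
eps = load_group("my.hypothetical.group", default=DEFAULTS)

# New in this release: skip_defaults=True returns None instead, so the caller
# can detect that nothing was actually registered under the group.
eps_or_none = load_group("my.hypothetical.group", default=DEFAULTS, skip_defaults=True)
if eps_or_none is None:
    print("no entry points registered for my.hypothetical.group")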
torchx/util/io.py
CHANGED
torchx/util/log_tee_helpers.py
ADDED
@@ -0,0 +1,210 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+"""
+If you're wrapping the TorchX API with your own CLI, these functions can
+help show the logs of the job within your CLI, just like
+`torchx log`
+"""
+
+import logging
+import threading
+from queue import Queue
+from typing import List, Optional, TextIO, Tuple, TYPE_CHECKING
+
+from torchx.util.types import none_throws
+
+if TYPE_CHECKING:
+    from torchx.runner.api import Runner
+    from torchx.schedulers.api import Stream
+    from torchx.specs.api import AppDef
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# A torchX job can have stderr/stdout for many replicas, of many roles
+# The scheduler API has functions that allow us to get,
+# with unspecified detail, the log lines of a given replica of
+# a given role.
+#
+# So, to neatly tee the results, we:
+# 1) Determine every role ID / replica ID pair we want to monitor
+# 2) Request the given stderr / stdout / combined streams from them (1 thread each)
+# 3) Concatenate each of those streams to a given destination file
+
+
+def _find_role_replicas(
+    app: "AppDef",
+    role_name: Optional[str],
+) -> List[Tuple[str, int]]:
+    """
+    Enumerate all (role, replica id) pairs in the given AppDef.
+    Replica IDs are 0-indexed, and range up to num_replicas,
+    for each role.
+    If role_name is provided, filters to only that name.
+    """
+    role_replicas = []
+    for role in app.roles:
+        if role_name is None or role_name == role.name:
+            for i in range(role.num_replicas):
+                role_replicas.append((role.name, i))
+    return role_replicas
+
+
+def _prefix_line(prefix: str, line: str) -> str:
+    """
+    _prefix_line ensure the prefix is still present even when dealing with return characters
+    """
+    if "\r" in line:
+        line = line.replace("\r", f"\r{prefix}")
+    if "\n" in line[:-1]:
+        line = line[:-1].replace("\n", f"\n{prefix}") + line[-1:]
+    if not line.startswith("\r"):
+        line = f"{prefix}{line}"
+    return line
+
+
+def _print_log_lines_for_role_replica(
+    dst: TextIO,
+    app_handle: str,
+    regex: Optional[str],
+    runner: "Runner",
+    which_role: str,
+    which_replica: int,
+    exceptions: "Queue[Exception]",
+    should_tail: bool,
+    streams: Optional["Stream"],
+    colorize: bool = False,
+) -> None:
+    """
+    Helper function that'll run in parallel - one
+    per monitored replica of a given role.
+
+    Based on print_log_lines .. but not designed for TTY
+    """
+    try:
+        for line in runner.log_lines(
+            app_handle,
+            which_role,
+            which_replica,
+            regex,
+            should_tail=should_tail,
+            streams=streams,
+        ):
+            if colorize:
+                color_begin = "\033[32m"
+                color_end = "\033[0m"
+            else:
+                color_begin = ""
+                color_end = ""
+            prefix = f"{color_begin}{which_role}/{which_replica}{color_end} "
+            print(_prefix_line(prefix, line.strip()), file=dst, end="\n", flush=True)
+    except Exception as e:
+        exceptions.put(e)
+        raise
+
+
+def _start_threads_to_monitor_role_replicas(
+    dst: TextIO,
+    app_handle: str,
+    regex: Optional[str],
+    runner: "Runner",
+    which_role: Optional[str] = None,
+    should_tail: bool = False,
+    streams: Optional["Stream"] = None,
+    colorize: bool = False,
+) -> None:
+    threads = []
+
+    app = none_throws(runner.describe(app_handle))
+    replica_ids = _find_role_replicas(app, role_name=which_role)
+
+    # Holds exceptions raised by all threads, in a thread-safe
+    # object
+    exceptions = Queue()
+
+    if not replica_ids:
+        valid_roles = [role.name for role in app.roles]
+        raise ValueError(
+            f"{which_role} is not a valid role name. Available: {valid_roles}"
+        )
+
+    for role_name, replica_id in replica_ids:
+        threads.append(
+            threading.Thread(
+                target=_print_log_lines_for_role_replica,
+                kwargs={
+                    "dst": dst,
+                    "runner": runner,
+                    "app_handle": app_handle,
+                    "which_role": role_name,
+                    "which_replica": replica_id,
+                    "regex": regex,
+                    "should_tail": should_tail,
+                    "exceptions": exceptions,
+                    "streams": streams,
+                    "colorize": colorize,
+                },
+                daemon=True,
+            )
+        )
+
+    for t in threads:
+        t.start()
+
+    for t in threads:
+        t.join()
+
+    # Retrieve all exceptions, print all except one and raise the first recorded exception
+    threads_exceptions = []
+    while not exceptions.empty():
+        threads_exceptions.append(exceptions.get())
+
+    if len(threads_exceptions) > 0:
+        for i in range(1, len(threads_exceptions)):
+            logger.error(threads_exceptions[i])
+
+        raise threads_exceptions[0]
+
+
+def tee_logs(
+    dst: TextIO,
+    app_handle: str,
+    regex: Optional[str],
+    runner: "Runner",
+    should_tail: bool = False,
+    streams: Optional["Stream"] = None,
+    colorize: bool = False,
+) -> threading.Thread:
+    """
+    Makes a thread, which in turn will start 1 thread per replica
+    per role, that tees that role-replica's logs to the given
+    destination buffer.
+
+    You'll need to start and join with this parent thread.
+
+    dst: TextIO to tee the logs into
+    app_handle: The return value of runner.run() or runner.schedule()
+    regex: Regex to filter the logs that are tee-d
+    runner: The Runner you used to schedule the job
+    should_tail: If true, continue until we run out of logs. Otherwise, just fetch
+        what's available
+    streams: Whether to fetch STDERR, STDOUT, or the temporally COMBINED (default) logs
+    """
+    thread = threading.Thread(
+        target=_start_threads_to_monitor_role_replicas,
+        kwargs={
+            "dst": dst,
+            "runner": runner,
+            "app_handle": app_handle,
+            "regex": None,
+            "should_tail": True,
+            "colorize": colorize,
+        },
+        daemon=True,
+    )
+    return thread
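For CLIs that wrap the TorchX runner, the new ``tee_logs`` helper mirrors every role/replica log stream into a single writable destination. A minimal sketch of the intended usage, assuming a scheduler that can run locally; the built-in ``torchx.components.utils.echo`` component and the ``local_cwd`` scheduler are illustrative choices, not requirements of the helper:

import sys

from torchx.components.utils import echo
from torchx.runner import get_runner
from torchx.util.log_tee_helpers import tee_logs

runner = get_runner()

# Submit any AppDef; echo is just a convenient built-in component.
app_handle = runner.run(echo(msg="hello world"), scheduler="local_cwd")

# tee_logs returns an un-started thread; per the docstring, the caller
# is responsible for starting and joining it.
log_thread = tee_logs(
    dst=sys.stdout,
    app_handle=app_handle,
    regex=None,
    runner=runner,
    should_tail=True,
)
log_thread.start()
runner.wait(app_handle)  # block until the job finishes
log_thread.join()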
torchx/util/modules.py
CHANGED
torchx/util/session.py
ADDED
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import os
+import uuid
+from typing import Optional
+
+TORCHX_INTERNAL_SESSION_ID = "TORCHX_INTERNAL_SESSION_ID"
+
+CURRENT_SESSION_ID: Optional[str] = None
+
+
+def get_session_id_or_create_new() -> str:
+    """
+    Returns the current session ID, or creates a new one if none exists.
+    The session ID remains the same as long as it is in the same process.
+    Please DO NOT use this function out of torchx codebase.
+    """
+    global CURRENT_SESSION_ID
+    if CURRENT_SESSION_ID:
+        return CURRENT_SESSION_ID
+    env_session_id = os.getenv(TORCHX_INTERNAL_SESSION_ID)
+    if env_session_id:
+        CURRENT_SESSION_ID = env_session_id
+        return CURRENT_SESSION_ID
+    session_id = str(uuid.uuid4())
+    CURRENT_SESSION_ID = session_id
+    return session_id
+
+
+def get_torchx_session_id() -> Optional[str]:
+    """
+    Returns the torchx session ID.
+    Please use this function to get the session ID out of torchx codebase.
+    """
+    return CURRENT_SESSION_ID
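The session ID is process-scoped: the first call mints a UUID, later calls return the cached value, and a value already present in ``TORCHX_INTERNAL_SESSION_ID`` takes precedence over minting a new one. A small sketch of that behavior; exporting the variable so that child processes adopt the parent's ID is an assumption about intent inferred from the env var name, not code shown in this release:

import os

from torchx.util.session import get_session_id_or_create_new, get_torchx_session_id

sid = get_session_id_or_create_new()            # mints (or adopts) the ID
assert get_session_id_or_create_new() == sid    # stable within this process
assert get_torchx_session_id() == sid           # read-only accessor sees the same value

# Assumption: setting the variable before spawning a child process that imports
# torchx would make the child reuse this session ID instead of minting its own.
os.environ["TORCHX_INTERNAL_SESSION_ID"] = sid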
torchx/util/shlex.py
CHANGED
torchx/util/strings.py
CHANGED
torchx/util/types.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
@@ -43,6 +45,9 @@ def to_dict(arg: str) -> Dict[str, str]:
 
        to_dict("FOO=v1") == {"FOO": "v1"}
 
+       to_dict("FOO=''") == {"FOO": ""}
+       to_dict('FOO=""') == {"FOO": ""}
+
        to_dict("FOO=v1,v2") == {"FOO": "v1,v2"]}
        to_dict("FOO=v1;v2") == {"FOO": "v1;v2"]}
        to_dict("FOO=v1;v2") == {"FOO": "v1;v2,"]}
@@ -68,6 +73,9 @@ def to_dict(arg: str) -> Dict[str, str]:
         else:
             return vk[0:idx].strip(), vk[idx + 1 :].strip()
 
+    def to_val(val: str) -> str:
+        return val if val != '""' and val != "''" else ""
+
     arg_map: Dict[str, str] = {}
 
     if not arg:
@@ -90,10 +98,10 @@ def to_dict(arg: str) -> Dict[str, str]:
     # middle elements are value_{n}<delim>key_{n+1}
     for vk in split_arg[1 : split_arg_len - 1]:  # python deals with
         val, key_next = parse_val_key(vk)
-        arg_map[key] = val
+        arg_map[key] = to_val(val)
         key = key_next
     val = split_arg[-1]  # last element is always a value
-    arg_map[key] = val
+    arg_map[key] = to_val(val)
     return arg_map
 
 
@@ -120,6 +128,16 @@ def _decode_string_to_list(
     return arg_values
 
 
+def decode(encoded_value: Any, annotation: Any):
+    if encoded_value is None:
+        return None
+    if is_bool(annotation):
+        return encoded_value and encoded_value.lower() == "true"
+    if not is_primitive(annotation) and type(encoded_value) == str:
+        return decode_from_string(encoded_value, annotation)
+    return encoded_value
+
+
 def decode_from_string(
     encoded_value: str, annotation: Any
 ) -> Union[Dict[Any, Any], List[Any], None]:
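The net effect of the new ``to_val`` helper: explicitly quoted empty values now decode to the empty string rather than a literal pair of quote characters. A quick illustration, restricted to the cases spelled out in the updated docstring:

from torchx.util.types import to_dict

assert to_dict("FOO=v1") == {"FOO": "v1"}

# New in this release: quoted empties become "" instead of "''" or '""'
assert to_dict("FOO=''") == {"FOO": ""}
assert to_dict('FOO=""') == {"FOO": ""}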
torchx/version.py
CHANGED
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 from torchx.util.entrypoints import load
 
 # Follows PEP-0440 version scheme guidelines
@@ -16,7 +18,7 @@ from torchx.util.entrypoints import load
 # 0.1.0bN  # Beta release
 # 0.1.0rcN  # Release Candidate
 # 0.1.0  # Final release
-__version__ = "0.
+__version__ = "0.8.0dev0"
 
 
 # Use the github container registry images corresponding to the current package
torchx/workspace/__init__.py
CHANGED
torchx/workspace/api.py
CHANGED
@@ -4,10 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import abc
 import fnmatch
 import posixpath
-from
+from dataclasses import dataclass
+from typing import Any, Dict, Generic, Iterable, Mapping, Tuple, TYPE_CHECKING, TypeVar
 
 from torchx.specs import AppDef, CfgVal, Role, runopts
 
@@ -18,6 +21,36 @@ TORCHX_IGNORE = ".torchxignore"
 
 T = TypeVar("T")
 
+PackageType = TypeVar("PackageType")
+WorkspaceConfigType = TypeVar("WorkspaceConfigType")
+
+
+@dataclass
+class PkgInfo(Generic[PackageType]):
+    """
+    Convenience class used to specify information regarding the built workspace
+    """
+
+    img: str
+    lazy_overrides: Dict[str, Any]
+    metadata: PackageType
+
+
+@dataclass
+class WorkspaceBuilder(Generic[PackageType, WorkspaceConfigType]):
+    cfg: WorkspaceConfigType
+
+    @abc.abstractmethod
+    def build_workspace(self, sync: bool = True) -> PkgInfo[PackageType]:
+        """
+        Builds the specified ``workspace`` with respect to ``img``.
+        In the simplest case, this method builds a new image.
+        Certain (more efficient) implementations build
+        incremental diff patches that overlay on top of the role's image.
+
+        """
+        pass
+
 
 class WorkspaceMixin(abc.ABC, Generic[T]):
     """
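The new generics give custom scheduler integrations a typed hook for workspace builds. A hypothetical subclass might look like the sketch below; the ``TarballInfo`` payload, the config dict, and the build logic are invented for illustration and are not part of torchx:

from dataclasses import dataclass
from typing import Dict

from torchx.workspace.api import PkgInfo, WorkspaceBuilder


@dataclass
class TarballInfo:
    path: str  # where the packed workspace archive would live


@dataclass
class TarballWorkspaceBuilder(WorkspaceBuilder[TarballInfo, Dict[str, str]]):
    def build_workspace(self, sync: bool = True) -> PkgInfo[TarballInfo]:
        # A real implementation would pack self.cfg["workspace"] into an archive;
        # this stub only shows the shape of the returned PkgInfo.
        return PkgInfo(
            img="example.com/base:latest",
            lazy_overrides={},
            metadata=TarballInfo(path="/tmp/workspace.tar.gz"),
        )


builder = TarballWorkspaceBuilder(cfg={"workspace": "."})
info = builder.build_workspace()
print(info.img, info.metadata.path)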
torchx/workspace/docker_workspace.py
CHANGED

@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import io
 import logging
 import posixpath
@@ -16,6 +18,7 @@ from typing import Dict, IO, Iterable, Mapping, Optional, TextIO, Tuple, TYPE_CH
 import fsspec
 
 import torchx
+from docker.errors import BuildError
 from torchx.specs import AppDef, CfgVal, Role, runopts
 from torchx.workspace.api import walk_workspace, WorkspaceMixin
 
@@ -91,6 +94,12 @@ class DockerWorkspaceMixin(WorkspaceMixin[Dict[str, Tuple[str, str]]]):
             type_=str,
             help="(remote jobs) the image repository to use when pushing patched images, must have push access. Ex: example.com/your/container",
         )
+        opts.add(
+            "quiet",
+            type_=bool,
+            default=False,
+            help="whether to suppress verbose output for image building. Defaults to ``False``.",
+        )
         return opts
 
     def build_workspace_and_update_role(
@@ -119,7 +128,7 @@ class DockerWorkspaceMixin(WorkspaceMixin[Dict[str, Tuple[str, str]]]):
                    f"failed to pull image {role.image}, falling back to local: {e}"
                )
            log.info("Building workspace docker image (this may take a while)...")
-
+            build_events = self._docker_client.api.build(
                fileobj=context,
                custom_context=True,
                dockerfile=TORCHX_DOCKERFILE,
@@ -129,12 +138,26 @@ class DockerWorkspaceMixin(WorkspaceMixin[Dict[str, Tuple[str, str]]]):
                },
                pull=False,
                rm=True,
+                decode=True,
                labels={
                    self.LABEL_VERSION: torchx.__version__,
                },
            )
+            image_id = None
+            for event in build_events:
+                if message := event.get("stream"):
+                    if not cfg.get("quiet", False):
+                        message = message.strip("\r\n").strip("\n")
+                        if message:
+                            log.info(message)
+                if aux := event.get("aux"):
+                    image_id = aux["ID"]
+                if error := event.get("error"):
+                    raise BuildError(reason=error, build_log=None)
            if len(old_imgs) == 0 or role.image not in old_imgs:
-
+                assert image_id, "image id was not found"
+                role.image = image_id
+
        finally:
            context.close()
 
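Downstream, the new ``quiet`` runopt lets callers silence the per-line Docker build output that the workspace build now streams through ``log.info``. A sketch of passing it via the runner cfg; the component, scheduler, and workspace path here are illustrative, and the exact ``run`` keyword arguments should be checked against the installed torchx version:

from torchx.components.utils import echo
from torchx.runner import get_runner

runner = get_runner()

# "quiet" is the DockerWorkspaceMixin runopt added in this release; the other
# values are placeholders for whatever job you actually want to launch.
app_handle = runner.run(
    echo(msg="hello world"),
    scheduler="local_docker",
    cfg={"quiet": True},
    workspace="file://.",  # triggers the Docker workspace build
)
print(runner.status(app_handle))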