wandb 0.18.1__py3-none-macosx_11_0_arm64.whl → 0.18.2__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -3
- wandb/__init__.pyi +67 -12
- wandb/apis/internal.py +3 -0
- wandb/apis/public/api.py +128 -2
- wandb/apis/public/artifacts.py +11 -7
- wandb/apis/public/jobs.py +8 -0
- wandb/apis/public/runs.py +16 -5
- wandb/bin/apple_gpu_stats +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +0 -3
- wandb/errors/__init__.py +11 -40
- wandb/errors/errors.py +37 -0
- wandb/errors/warnings.py +2 -0
- wandb/integration/tensorboard/log.py +1 -1
- wandb/old/core.py +2 -80
- wandb/plot/bar.py +7 -4
- wandb/plot/confusion_matrix.py +5 -4
- wandb/plot/histogram.py +7 -4
- wandb/plot/line.py +7 -4
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/_validators.py +48 -3
- wandb/sdk/artifacts/artifact.py +157 -183
- wandb/sdk/artifacts/artifact_file_cache.py +13 -11
- wandb/sdk/artifacts/artifact_instance_cache.py +4 -2
- wandb/sdk/artifacts/artifact_manifest.py +13 -11
- wandb/sdk/artifacts/artifact_manifest_entry.py +24 -22
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +9 -7
- wandb/sdk/artifacts/artifact_saver.py +27 -25
- wandb/sdk/artifacts/exceptions.py +26 -25
- wandb/sdk/artifacts/storage_handler.py +11 -9
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +16 -14
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +15 -13
- wandb/sdk/artifacts/storage_handlers/http_handler.py +15 -14
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +14 -12
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +19 -19
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +12 -10
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +9 -7
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +31 -29
- wandb/sdk/artifacts/storage_policy.py +20 -20
- wandb/sdk/backend/backend.py +8 -26
- wandb/sdk/data_types/base_types/wb_value.py +1 -3
- wandb/sdk/data_types/video.py +2 -2
- wandb/sdk/interface/interface.py +0 -24
- wandb/sdk/interface/interface_shared.py +0 -12
- wandb/sdk/internal/handler.py +0 -10
- wandb/sdk/internal/internal_api.py +71 -0
- wandb/sdk/internal/sender.py +0 -43
- wandb/sdk/internal/tb_watcher.py +1 -1
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/hashutil.py +34 -12
- wandb/sdk/lib/service_connection.py +216 -0
- wandb/sdk/lib/service_token.py +94 -0
- wandb/sdk/lib/sock_client.py +7 -3
- wandb/sdk/service/server.py +2 -5
- wandb/sdk/service/service.py +0 -22
- wandb/sdk/wandb_init.py +32 -22
- wandb/sdk/wandb_run.py +12 -7
- wandb/sdk/wandb_settings.py +2 -0
- wandb/sdk/wandb_setup.py +25 -16
- wandb/sdk/wandb_sync.py +9 -3
- wandb/sdk/wandb_watch.py +31 -15
- wandb/util.py +8 -1
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/METADATA +2 -1
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/RECORD +72 -72
- wandb/sdk/internal/update.py +0 -113
- wandb/sdk/service/service_base.py +0 -50
- wandb/sdk/service/service_sock.py +0 -70
- wandb/sdk/wandb_manager.py +0 -232
- /wandb/{sdk/lib → plot}/viz.py +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/WHEEL +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/entry_points.txt +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/licenses/LICENSE +0 -0
    
        wandb/sdk/lib/hashutil.py
    CHANGED
    
    | @@ -1,19 +1,22 @@ | |
| 1 | 
            +
            from __future__ import annotations
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            import base64
         | 
| 2 4 | 
             
            import hashlib
         | 
| 3 5 | 
             
            import mmap
         | 
| 4 | 
            -
            import os
         | 
| 5 6 | 
             
            import sys
         | 
| 6 | 
            -
            from  | 
| 7 | 
            -
            from typing import NewType, Union
         | 
| 7 | 
            +
            from typing import TYPE_CHECKING, NewType
         | 
| 8 8 |  | 
| 9 9 | 
             
            from wandb.sdk.lib.paths import StrPath
         | 
| 10 10 |  | 
| 11 | 
            +
            if TYPE_CHECKING:
         | 
| 12 | 
            +
                import _hashlib  # type: ignore[import-not-found]
         | 
| 13 | 
            +
             | 
| 11 14 | 
             
            ETag = NewType("ETag", str)
         | 
| 12 15 | 
             
            HexMD5 = NewType("HexMD5", str)
         | 
| 13 16 | 
             
            B64MD5 = NewType("B64MD5", str)
         | 
| 14 17 |  | 
| 15 18 |  | 
| 16 | 
            -
            def _md5(data: bytes = b"") ->  | 
| 19 | 
            +
            def _md5(data: bytes = b"") -> _hashlib.HASH:
         | 
| 17 20 | 
             
                """Allow FIPS-compliant md5 hash when supported."""
         | 
| 18 21 | 
             
                if sys.version_info >= (3, 9):
         | 
| 19 22 | 
             
                    return hashlib.md5(data, usedforsecurity=False)
         | 
| @@ -25,7 +28,7 @@ def md5_string(string: str) -> B64MD5: | |
| 25 28 | 
             
                return _b64_from_hasher(_md5(string.encode("utf-8")))
         | 
| 26 29 |  | 
| 27 30 |  | 
| 28 | 
            -
            def _b64_from_hasher(hasher:  | 
| 31 | 
            +
            def _b64_from_hasher(hasher: _hashlib.HASH) -> B64MD5:
                """Return the hasher's current digest as a base64-encoded ASCII string."""
                raw_digest = hasher.digest()
                encoded = base64.b64encode(raw_digest)
                return B64MD5(encoded.decode("ascii"))
         | 
| 30 33 |  | 
| 31 34 |  | 
| @@ -33,7 +36,7 @@ def b64_to_hex_id(string: B64MD5) -> HexMD5: | |
| 33 36 | 
             
                return HexMD5(base64.standard_b64decode(string).hex())
         | 
| 34 37 |  | 
| 35 38 |  | 
| 36 | 
            -
            def hex_to_b64_id(encoded_string:  | 
| 39 | 
            +
            def hex_to_b64_id(encoded_string: str | bytes) -> B64MD5:
         | 
| 37 40 | 
             
                if isinstance(encoded_string, bytes):
         | 
| 38 41 | 
             
                    encoded_string = encoded_string.decode("utf-8")
         | 
| 39 42 | 
             
                as_str = bytes.fromhex(encoded_string)
         | 
| @@ -48,15 +51,34 @@ def md5_file_hex(*paths: StrPath) -> HexMD5: | |
| 48 51 | 
             
                return HexMD5(_md5_file_hasher(*paths).hexdigest())
         | 
| 49 52 |  | 
| 50 53 |  | 
| 51 | 
            -
             | 
| 54 | 
            +
            _KB: int = 1_024
         | 
| 55 | 
            +
            _CHUNKSIZE: int = 128 * _KB
         | 
| 56 | 
            +
            """Chunk size (in bytes) for iteratively reading from file, if needed."""
         | 
| 57 | 
            +
             | 
| 58 | 
            +
             | 
| 59 | 
            +
            def _md5_file_hasher(*paths: StrPath) -> _hashlib.HASH:
                """Return an MD5 hasher that has consumed the contents of every given file.

                The paths are converted to strings and processed in sorted order, so the
                resulting digest does not depend on the order of the arguments.
                """
                hasher = _md5()

                # Note: str paths (instead of pathlib.Path objs) give a minor perf improvement.
                for file_path in sorted(str(p) for p in paths):
                    with open(file_path, "rb") as stream:
                        try:
                            with mmap.mmap(
                                stream.fileno(), length=0, access=mmap.ACCESS_READ
                            ) as mapped:
                                hasher.update(mapped)
                        except ValueError:
                            # This occurs when mmap-ing an empty file, which can be skipped.
                            # See: https://github.com/python/cpython/blob/986a4e1b6fcae7fe7a1d0a26aea446107dd58dd2/Modules/mmapmodule.c#L1589
                            continue
                        except OSError:
                            # This occurs if the mmap-ed file is on a different/mounted
                            # filesystem, so fall back on a less performant chunked read.
                            # `iter(callable, sentinel)` stops at the first empty read (EOF).
                            for block in iter(lambda: stream.read(_CHUNKSIZE), b""):
                                hasher.update(block)

                return hasher
         | 
| @@ -0,0 +1,216 @@ | |
| 1 | 
            +
            from __future__ import annotations
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import atexit
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            from typing import Callable
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from wandb.proto import wandb_internal_pb2 as pb
         | 
| 8 | 
            +
            from wandb.proto import wandb_server_pb2 as spb
         | 
| 9 | 
            +
            from wandb.proto import wandb_settings_pb2
         | 
| 10 | 
            +
            from wandb.sdk import wandb_settings
         | 
| 11 | 
            +
            from wandb.sdk.interface.interface import InterfaceBase
         | 
| 12 | 
            +
            from wandb.sdk.interface.interface_sock import InterfaceSock
         | 
| 13 | 
            +
            from wandb.sdk.lib import service_token
         | 
| 14 | 
            +
            from wandb.sdk.lib.exit_hooks import ExitHooks
         | 
| 15 | 
            +
            from wandb.sdk.lib.mailbox import Mailbox
         | 
| 16 | 
            +
            from wandb.sdk.lib.sock_client import SockClient, SockClientTimeoutError
         | 
| 17 | 
            +
            from wandb.sdk.service import service
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            class WandbServiceNotOwnedError(Exception):
                """Raised when the current process does not own the service process.

                `ServiceConnection.teardown` raises this when the connection was created
                without a service process handle (`proc=None`).
                """
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            class WandbServiceConnectionError(Exception):
                """Raised on failure to connect to the service process.

                `_try_connect_to_existing_service` raises this, chained to the original
                exception, when the socket connection to an existing service fails.
                """
         | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            class WandbAttachFailedError(Exception):
                """Raised if attaching to a run fails.

                `ServiceConnection.inform_attach` raises this when the attach request
                ends in a `SockClientTimeoutError`.
                """
         | 
| 30 | 
            +
             | 
| 31 | 
            +
             | 
| 32 | 
            +
            def connect_to_service(
                settings: wandb_settings.Settings,
            ) -> ServiceConnection:
                """Return a connection to the service process, starting one if necessary.

                An already-running service (found through the service token) is preferred;
                a new service is started with `settings` only when none is found.
                """
                existing = _try_connect_to_existing_service()
                return existing if existing else _start_and_connect_service(settings)
         | 
| 41 | 
            +
             | 
| 42 | 
            +
             | 
| 43 | 
            +
            def _try_connect_to_existing_service() -> ServiceConnection | None:
                """Attempts to connect to an existing service process.

                Returns:
                    A connection that does not own the service process (`proc=None`),
                    or None if no service token is set.

                Raises:
                    WandbServiceConnectionError: If the token names a host other than
                        localhost, or if connecting to the service fails.
                    ValueError: Propagated from `service_token.get_service_token` when
                        the token is set but cannot be parsed.
                """
                token = service_token.get_service_token()
                if not token:
                    return None

                # Only localhost sockets are supported below. This is checked with an
                # explicit raise rather than `assert`: asserts are stripped under
                # `python -O`, which would silently connect to localhost even though the
                # token names another host.
                if token.host != "localhost":
                    raise WandbServiceConnectionError(
                        f"Unsupported service host {token.host!r}; only 'localhost' is supported."
                    )

                client = SockClient()

                try:
                    # TODO: This may block indefinitely if the service is unhealthy.
                    client.connect(token.port)
                except Exception as e:
                    raise WandbServiceConnectionError(
                        "Failed to connect to internal service."
                    ) from e

                return ServiceConnection(client=client, proc=None)
         | 
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
            def _start_and_connect_service(
                settings: wandb_settings.Settings,
            ) -> ServiceConnection:
                """Launch a new service process and return a connection that owns it.

                The service token is published so that later lookups can find this
                service. An atexit hook is registered to tear down the service process and
                wait for it to complete. The hook does not run in processes started using
                the multiprocessing module.
                """
                service_proc = service._Service(settings)
                service_proc.start()

                sock_port = service_proc.sock_port
                assert sock_port
                sock = SockClient()
                sock.connect(sock_port)

                service_token.set_service_token(
                    parent_pid=os.getpid(),
                    transport="tcp",
                    host="localhost",
                    port=sock_port,
                )

                exit_hooks = ExitHooks()
                exit_hooks.hook()

                def _teardown_on_exit():
                    # `connection` is bound below, before this can ever be invoked.
                    connection.teardown(exit_hooks.exit_code)

                def _unregister_exit_hook() -> None:
                    # Runs at the start of teardown so the atexit hook cannot fire again.
                    atexit.unregister(_teardown_on_exit)

                connection = ServiceConnection(
                    client=sock,
                    proc=service_proc,
                    cleanup=_unregister_exit_hook,
                )

                atexit.register(_teardown_on_exit)

                return connection
         | 
| 104 | 
            +
             | 
| 105 | 
            +
             | 
| 106 | 
            +
            class ServiceConnection:
                """A connection to the W&B internal service process."""

                def __init__(
                    self,
                    client: SockClient,
                    proc: service._Service | None,
                    cleanup: Callable[[], None] | None = None,
                ):
                    """Returns a new ServiceConnection.

                    Args:
                        client: A socket that's connected to the service.
                        proc: The service process if we own it, or None otherwise.
                        cleanup: A callback to run on teardown before doing anything.
                    """
                    self._client = client
                    self._proc = proc
                    self._torn_down = False
                    self._cleanup = cleanup

                def make_interface(self, mailbox: Mailbox) -> InterfaceBase:
                    """Returns an interface for communicating with the service."""
                    return InterfaceSock(self._client, mailbox)

                def send_record(self, record: pb.Record) -> None:
                    """Sends data to the service."""
                    self._client.send_record_publish(record)

                def inform_init(
                    self,
                    settings: wandb_settings_pb2.Settings,
                    run_id: str,
                ) -> None:
                    """Sends an init request to the service.

                    Args:
                        settings: Settings copied into the request.
                        run_id: Used as the request's stream ID.
                    """
                    request = spb.ServerInformInitRequest()
                    request.settings.CopyFrom(settings)
                    request._info.stream_id = run_id
                    self._client.send(inform_init=request)

                def inform_finish(self, run_id: str) -> None:
                    """Sends a finish request to the service.

                    Args:
                        run_id: Used as the request's stream ID.
                    """
                    request = spb.ServerInformFinishRequest()
                    request._info.stream_id = run_id
                    self._client.send(inform_finish=request)

                def inform_attach(
                    self,
                    attach_id: str,
                ) -> wandb_settings_pb2.Settings:
                    """Sends an attach request to the service.

                    Args:
                        attach_id: Used as the request's stream ID.

                    Returns:
                        The settings carried by the service's attach response.

                    Raises:
                        WandbAttachFailedError: If the service does not respond before
                            the socket client times out.
                    """
                    request = spb.ServerInformAttachRequest()
                    request._info.stream_id = attach_id

                    # Keep the try body to the one call that can time out, and chain the
                    # timeout explicitly so it is reported as the direct cause.
                    try:
                        response = self._client.send_and_recv(inform_attach=request)
                    except SockClientTimeoutError as e:
                        raise WandbAttachFailedError(
                            "Could not attach because the run does not belong to"
                            " the current service process, or because the service"
                            " process is busy (unlikely)."
                        ) from e

                    return response.inform_attach_response.settings

                def inform_start(
                    self,
                    settings: wandb_settings_pb2.Settings,
                    run_id: str,
                ) -> None:
                    """Sends a start request to the service.

                    Args:
                        settings: Settings copied into the request.
                        run_id: Used as the request's stream ID.
                    """
                    request = spb.ServerInformStartRequest()
                    request.settings.CopyFrom(settings)
                    request._info.stream_id = run_id
                    self._client.send(inform_start=request)

                def teardown(self, exit_code: int) -> int:
                    """Shuts down the service process and returns its exit code.

                    This may only be called once.

                    Args:
                        exit_code: The exit code sent to the service in the teardown
                            request.

                    Returns:
                        The exit code of the service process.

                    Raises:
                        WandbServiceNotOwnedError: If the current process did not start
                            the service process.
                    """
                    if not self._proc:
                        raise WandbServiceNotOwnedError(
                            "Cannot tear down service started by different process",
                        )

                    assert not self._torn_down
                    self._torn_down = True

                    # The cleanup callback runs before anything else in the teardown.
                    if self._cleanup:
                        self._cleanup()

                    # Clear the service token to prevent new connections from being made.
                    service_token.clear_service_token()

                    self._client.send(
                        inform_teardown=spb.ServerInformTeardownRequest(
                            exit_code=exit_code,
                        )
                    )

                    return self._proc.join()
         | 
| @@ -0,0 +1,94 @@ | |
| 1 | 
            +
            from __future__ import annotations
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import dataclasses
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from wandb import env
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Token format version; tokens carrying any other version are rejected.
            _CURRENT_VERSION = "2"

            # A set rather than a bare string: with a string, `transport not in ...`
            # is a substring test and would accept values such as "t", "cp" or "".
            _SUPPORTED_TRANSPORTS = frozenset({"tcp"})
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            def get_service_token() -> ServiceToken | None:
                """Parse the service token stored in the environment.

                The token is five "-"-separated fields: version, pid, transport, host
                and port.

                Returns:
                    The token if the correct environment variable is set, or None.

                Raises:
                    ValueError: If the environment variable is set but cannot be
                        parsed.
                """
                token = os.getenv(env.SERVICE)
                if not token:
                    return None

                fields = token.split("-")
                if len(fields) != 5:
                    raise ValueError(f"Invalid token: {token}")
                version, pid_str, transport, host, port_str = fields

                if version != _CURRENT_VERSION:
                    raise ValueError(
                        f"Expected version {_CURRENT_VERSION},"
                        f" but got {version} (token={token})"
                    )
                if transport not in _SUPPORTED_TRANSPORTS:
                    raise ValueError(
                        f"Unsupported transport: {transport} (token={token})",
                    )

                # Only the integer conversions can fail at this point.
                try:
                    pid = int(pid_str)
                    port = int(port_str)
                except ValueError as e:
                    raise ValueError(f"Invalid token: {token}") from e

                return ServiceToken(
                    version=version,
                    pid=pid,
                    transport=transport,
                    host=host,
                    port=port,
                )
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
| 54 | 
            +
            def set_service_token(parent_pid: int, transport: str, host: str, port: int) -> None:
                """Publish a service token through an environment variable.

                Args:
                    parent_pid: The process ID of the process that started the service.
                    transport: The transport used to communicate with the service.
                    host: The host part of the internet address on which the service
                        is listening (e.g. localhost).
                    port: The port the service is listening on.

                Raises:
                    ValueError: If given an unsupported transport.
                """
                if transport not in _SUPPORTED_TRANSPORTS:
                    raise ValueError(f"Unsupported transport: {transport}")

                # Field order must match what get_service_token unpacks.
                fields = [_CURRENT_VERSION, str(parent_pid), transport, host, str(port)]
                os.environ[env.SERVICE] = "-".join(fields)
         | 
| 79 | 
            +
             | 
| 80 | 
            +
             | 
| 81 | 
            +
            def clear_service_token() -> None:
                """Remove the environment variable holding the service token, if set."""
                try:
                    del os.environ[env.SERVICE]
                except KeyError:
                    # Already absent; nothing to clear.
                    pass
         | 
| 84 | 
            +
             | 
| 85 | 
            +
             | 
| 86 | 
            +
            @dataclasses.dataclass(frozen=True)
         | 
| 87 | 
            +
            class ServiceToken:
         | 
| 88 | 
            +
                """An identifier for a running service process."""
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                version: str
         | 
| 91 | 
            +
                pid: int
         | 
| 92 | 
            +
                transport: str
         | 
| 93 | 
            +
                host: str
         | 
| 94 | 
            +
                port: int
         | 
    
        wandb/sdk/lib/sock_client.py
    CHANGED
    
    | @@ -14,9 +14,11 @@ if TYPE_CHECKING: | |
| 14 14 |  | 
| 15 15 |  | 
| 16 16 | 
             
            class SockClientClosedError(Exception):
         | 
| 17 | 
            -
                """ | 
| 17 | 
            +
                """Raised on operations on a closed socket."""
         | 
| 18 18 |  | 
| 19 | 
            -
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            class SockClientTimeoutError(Exception):
         | 
| 21 | 
            +
                """Raised if the server didn't respond before the timeout."""
         | 
| 20 22 |  | 
| 21 23 |  | 
| 22 24 | 
             
            class SockBuffer:
         | 
| @@ -182,8 +184,10 @@ class SockClient: | |
| 182 184 | 
             
                    # it should be relatively stable.
         | 
| 183 185 | 
             
                    # This pass would be solved as part of the fix in https://wandb.atlassian.net/browse/WB-8709
         | 
| 184 186 | 
             
                    response = self.read_server_response(timeout=1)
         | 
| 187 | 
            +
             | 
| 185 188 | 
             
                    if response is None:
         | 
| 186 | 
            -
                        raise  | 
| 189 | 
            +
                        raise SockClientTimeoutError("No response after 1 second.")
         | 
| 190 | 
            +
             | 
| 187 191 | 
             
                    return response
         | 
| 188 192 |  | 
| 189 193 | 
             
                def send(
         | 
    
        wandb/sdk/service/server.py
    CHANGED
    
    | @@ -20,7 +20,6 @@ class WandbServer: | |
| 20 20 | 
             
                _pid: Optional[int]
         | 
| 21 21 | 
             
                _sock_port: Optional[int]
         | 
| 22 22 | 
             
                _debug: bool
         | 
| 23 | 
            -
                _serve_sock: bool
         | 
| 24 23 | 
             
                _sock_server: Optional[SocketServer]
         | 
| 25 24 | 
             
                _startup_debug_enabled: bool
         | 
| 26 25 |  | 
| @@ -31,14 +30,12 @@ class WandbServer: | |
| 31 30 | 
             
                    address: Optional[str] = None,
         | 
| 32 31 | 
             
                    pid: Optional[int] = None,
         | 
| 33 32 | 
             
                    debug: bool = True,
         | 
| 34 | 
            -
                    serve_sock: bool = False,
         | 
| 35 33 | 
             
                ) -> None:
         | 
| 36 34 | 
             
                    self._sock_port = sock_port
         | 
| 37 35 | 
             
                    self._port_fname = port_fname
         | 
| 38 36 | 
             
                    self._address = address
         | 
| 39 37 | 
             
                    self._pid = pid
         | 
| 40 38 | 
             
                    self._debug = debug
         | 
| 41 | 
            -
                    self._serve_sock = serve_sock
         | 
| 42 39 | 
             
                    self._sock_server = None
         | 
| 43 40 | 
             
                    self._startup_debug_enabled = _startup_debug.is_enabled()
         | 
| 44 41 |  | 
| @@ -97,7 +94,7 @@ class WandbServer: | |
| 97 94 | 
             
                        pid = str(self._pid or 0)
         | 
| 98 95 | 
             
                        transport = "s" if sock_port else "g"
         | 
| 99 96 | 
             
                        port = sock_port or 0
         | 
| 100 | 
            -
                        # this format is similar to  | 
| 97 | 
            +
                        # this format is similar to the service token, but it's purely informative now
         | 
| 101 98 | 
             
                        # (consider unifying this in the future)
         | 
| 102 99 | 
             
                        service_id = f"{service_ver}-{pid}-{transport}-{port}"
         | 
| 103 100 | 
             
                        proc_title = f"wandb-service({service_id})"
         | 
| @@ -109,7 +106,7 @@ class WandbServer: | |
| 109 106 | 
             
                    self._setup_tracelog()
         | 
| 110 107 | 
             
                    mux = StreamMux()
         | 
| 111 108 | 
             
                    self._startup_debug_print("before_network")
         | 
| 112 | 
            -
                    sock_port = self._start_sock(mux=mux) | 
| 109 | 
            +
                    sock_port = self._start_sock(mux=mux)
         | 
| 113 110 | 
             
                    self._startup_debug_print("after_network")
         | 
| 114 111 | 
             
                    self._inform_used_ports(sock_port=sock_port)
         | 
| 115 112 | 
             
                    self._startup_debug_print("after_inform")
         | 
    
        wandb/sdk/service/service.py
    CHANGED
    
    | @@ -21,8 +21,6 @@ from wandb.sdk.lib.wburls import wburls | |
| 21 21 | 
             
            from wandb.util import get_core_path, get_module
         | 
| 22 22 |  | 
| 23 23 | 
             
            from . import _startup_debug, port_file
         | 
| 24 | 
            -
            from .service_base import ServiceInterface
         | 
| 25 | 
            -
            from .service_sock import ServiceSockInterface
         | 
| 26 24 |  | 
| 27 25 | 
             
            if TYPE_CHECKING:
         | 
| 28 26 | 
             
                from wandb.sdk.wandb_settings import Settings
         | 
| @@ -31,25 +29,18 @@ if TYPE_CHECKING: | |
| 31 29 | 
             
            class ServiceStartProcessError(Error):
         | 
| 32 30 | 
             
                """Raised when a known error occurs when launching wandb service."""
         | 
| 33 31 |  | 
| 34 | 
            -
                pass
         | 
| 35 | 
            -
             | 
| 36 32 |  | 
| 37 33 | 
             
            class ServiceStartTimeoutError(Error):
         | 
| 38 34 | 
             
                """Raised when service start times out."""
         | 
| 39 35 |  | 
| 40 | 
            -
                pass
         | 
| 41 | 
            -
             | 
| 42 36 |  | 
| 43 37 | 
             
            class ServiceStartPortError(Error):
         | 
| 44 38 | 
             
                """Raised when service start fails to find a port."""
         | 
| 45 39 |  | 
| 46 | 
            -
                pass
         | 
| 47 | 
            -
             | 
| 48 40 |  | 
| 49 41 | 
             
            class _Service:
         | 
| 50 42 | 
             
                _settings: "Settings"
         | 
| 51 43 | 
             
                _sock_port: Optional[int]
         | 
| 52 | 
            -
                _service_interface: ServiceInterface
         | 
| 53 44 | 
             
                _internal_proc: Optional[subprocess.Popen]
         | 
| 54 45 | 
             
                _startup_debug_enabled: bool
         | 
| 55 46 |  | 
| @@ -65,10 +56,6 @@ class _Service: | |
| 65 56 |  | 
| 66 57 | 
             
                    _sentry.configure_scope(tags=dict(settings), process_context="service")
         | 
| 67 58 |  | 
| 68 | 
            -
                    # current code only supports socket server implementation, in the
         | 
| 69 | 
            -
                    # future we might be able to support both
         | 
| 70 | 
            -
                    self._service_interface = ServiceSockInterface()
         | 
| 71 | 
            -
             | 
| 72 59 | 
             
                def _startup_debug_print(self, message: str) -> None:
         | 
| 73 60 | 
             
                    if not self._startup_debug_enabled:
         | 
| 74 61 | 
             
                        return
         | 
| @@ -175,10 +162,6 @@ class _Service: | |
| 175 162 | 
             
                            if core_debug(default="False"):
         | 
| 176 163 | 
             
                                service_args.append("--debug")
         | 
| 177 164 |  | 
| 178 | 
            -
                            trace_filename = os.environ.get("_WANDB_TRACE")
         | 
| 179 | 
            -
                            if trace_filename is not None:
         | 
| 180 | 
            -
                                service_args.extend(["--trace", trace_filename])
         | 
| 181 | 
            -
             | 
| 182 165 | 
             
                            exec_cmd_list = []
         | 
| 183 166 | 
             
                            termlog(
         | 
| 184 167 | 
             
                                "Using wandb-core as the SDK backend."
         | 
| @@ -194,7 +177,6 @@ class _Service: | |
| 194 177 | 
             
                            "--pid",
         | 
| 195 178 | 
             
                            pid,
         | 
| 196 179 | 
             
                        ]
         | 
| 197 | 
            -
                        service_args.append("--serve-sock")
         | 
| 198 180 |  | 
| 199 181 | 
             
                        if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
         | 
| 200 182 | 
             
                            _ = get_module(
         | 
| @@ -253,10 +235,6 @@ class _Service: | |
| 253 235 | 
             
                def sock_port(self) -> Optional[int]:
         | 
| 254 236 | 
             
                    return self._sock_port
         | 
| 255 237 |  | 
| 256 | 
            -
                @property
         | 
| 257 | 
            -
                def service_interface(self) -> ServiceInterface:
         | 
| 258 | 
            -
                    return self._service_interface
         | 
| 259 | 
            -
             | 
| 260 238 | 
             
                def join(self) -> int:
         | 
| 261 239 | 
             
                    ret = 0
         | 
| 262 240 | 
             
                    if self._internal_proc:
         | 
    
        wandb/sdk/wandb_init.py
    CHANGED
    
    | @@ -655,9 +655,9 @@ class _WandbInit: | |
| 655 655 | 
             
                                    f"Successfully finished last run (ID:{latest_run._run_id}). Initializing new run:<br/>"
         | 
| 656 656 | 
             
                                )
         | 
| 657 657 | 
             
                    elif isinstance(wandb.run, Run):
         | 
| 658 | 
            -
                         | 
| 658 | 
            +
                        service = self._wl.service
         | 
| 659 659 | 
             
                        # We shouldn't return a stale global run if we are in a new pid
         | 
| 660 | 
            -
                        if not  | 
| 660 | 
            +
                        if not service or os.getpid() == wandb.run._init_pid:
         | 
| 661 661 | 
             
                            logger.info("wandb.init() called when a run is still active")
         | 
| 662 662 | 
             
                            with telemetry.context() as tel:
         | 
| 663 663 | 
             
                                tel.feature.init_return_run = True
         | 
| @@ -665,15 +665,20 @@ class _WandbInit: | |
| 665 665 |  | 
| 666 666 | 
             
                    logger.info("starting backend")
         | 
| 667 667 |  | 
| 668 | 
            -
                     | 
| 669 | 
            -
                    if  | 
| 670 | 
            -
                        logger.info(" | 
| 671 | 
            -
                         | 
| 672 | 
            -
                            settings=self.settings.to_proto(), | 
| 668 | 
            +
                    service = self._wl.service
         | 
| 669 | 
            +
                    if service:
         | 
| 670 | 
            +
                        logger.info("sending inform_init request")
         | 
| 671 | 
            +
                        service.inform_init(
         | 
| 672 | 
            +
                            settings=self.settings.to_proto(),
         | 
| 673 | 
            +
                            run_id=self.settings.run_id,
         | 
| 673 674 | 
             
                        )
         | 
| 674 675 |  | 
| 675 676 | 
             
                    mailbox = Mailbox()
         | 
| 676 | 
            -
                    backend = Backend( | 
| 677 | 
            +
                    backend = Backend(
         | 
| 678 | 
            +
                        settings=self.settings,
         | 
| 679 | 
            +
                        service=service,
         | 
| 680 | 
            +
                        mailbox=mailbox,
         | 
| 681 | 
            +
                    )
         | 
| 677 682 | 
             
                    backend.ensure_launched()
         | 
| 678 683 | 
             
                    logger.info("backend started and connected")
         | 
| 679 684 | 
             
                    # Make sure we are logged in
         | 
| @@ -739,7 +744,7 @@ class _WandbInit: | |
| 739 744 | 
             
                        if os.environ.get(wandb.env._DISABLE_SERVICE):
         | 
| 740 745 | 
             
                            tel.feature.service_disabled = True
         | 
| 741 746 |  | 
| 742 | 
            -
                        if  | 
| 747 | 
            +
                        if service:
         | 
| 743 748 | 
             
                            tel.feature.service = True
         | 
| 744 749 | 
             
                        if self.settings._flow_control_disabled:
         | 
| 745 750 | 
             
                            tel.feature.flow_control_disabled = True
         | 
| @@ -830,7 +835,7 @@ class _WandbInit: | |
| 830 835 |  | 
| 831 836 | 
             
                    if error is not None:
         | 
| 832 837 | 
             
                        logger.error(f"encountered error: {error}")
         | 
| 833 | 
            -
                        if not  | 
| 838 | 
            +
                        if not service:
         | 
| 834 839 | 
             
                            # Shutdown the backend and get rid of the logger
         | 
| 835 840 | 
             
                            # we don't need to do console cleanup at this point
         | 
| 836 841 | 
             
                            backend.cleanup()
         | 
| @@ -857,9 +862,10 @@ class _WandbInit: | |
| 857 862 | 
             
                    logger.info("starting run threads in backend")
         | 
| 858 863 | 
             
                    # initiate run (stats and metadata probing)
         | 
| 859 864 |  | 
| 860 | 
            -
                    if  | 
| 861 | 
            -
                         | 
| 862 | 
            -
                            settings=self.settings.to_proto(), | 
| 865 | 
            +
                    if service:
         | 
| 866 | 
            +
                        service.inform_start(
         | 
| 867 | 
            +
                            settings=self.settings.to_proto(),
         | 
| 868 | 
            +
                            run_id=self.settings.run_id,
         | 
| 863 869 | 
             
                        )
         | 
| 864 870 |  | 
| 865 871 | 
             
                    assert backend.interface
         | 
| @@ -934,33 +940,37 @@ def _attach( | |
| 934 940 | 
             
                if logger is None:
         | 
| 935 941 | 
             
                    raise UsageError("logger is not initialized")
         | 
| 936 942 |  | 
| 937 | 
            -
                 | 
| 938 | 
            -
                 | 
| 939 | 
            -
             | 
| 940 | 
            -
             | 
| 943 | 
            +
                service = _wl.service
         | 
| 944 | 
            +
                if not service:
         | 
| 945 | 
            +
                    raise UsageError(f"Unable to attach to run {attach_id} (no service process)")
         | 
| 946 | 
            +
             | 
| 947 | 
            +
                try:
         | 
| 948 | 
            +
                    attach_settings = service.inform_attach(attach_id=attach_id)
         | 
| 949 | 
            +
                except Exception as e:
         | 
| 950 | 
            +
                    raise UsageError(f"Unable to attach to run {attach_id}") from e
         | 
| 941 951 |  | 
| 942 952 | 
             
                settings: Settings = copy.copy(_wl._settings)
         | 
| 943 953 |  | 
| 944 954 | 
             
                settings.update(
         | 
| 945 955 | 
             
                    {
         | 
| 946 956 | 
             
                        "run_id": attach_id,
         | 
| 947 | 
            -
                        "_start_time":  | 
| 948 | 
            -
                        "_start_datetime":  | 
| 949 | 
            -
                        "_offline":  | 
| 957 | 
            +
                        "_start_time": attach_settings._start_time.value,
         | 
| 958 | 
            +
                        "_start_datetime": attach_settings._start_datetime.value,
         | 
| 959 | 
            +
                        "_offline": attach_settings._offline.value,
         | 
| 950 960 | 
             
                    },
         | 
| 951 961 | 
             
                    source=Source.INIT,
         | 
| 952 962 | 
             
                )
         | 
| 953 963 |  | 
| 954 964 | 
             
                # TODO: consolidate this codepath with wandb.init()
         | 
| 955 965 | 
             
                mailbox = Mailbox()
         | 
| 956 | 
            -
                backend = Backend(settings=settings,  | 
| 966 | 
            +
                backend = Backend(settings=settings, service=service, mailbox=mailbox)
         | 
| 957 967 | 
             
                backend.ensure_launched()
         | 
| 958 968 | 
             
                logger.info("attach backend started and connected")
         | 
| 959 969 |  | 
| 960 970 | 
             
                if run is None:
         | 
| 961 971 | 
             
                    run = Run(settings=settings)
         | 
| 962 972 | 
             
                else:
         | 
| 963 | 
            -
                    run._init()
         | 
| 973 | 
            +
                    run._init(settings=settings)
         | 
| 964 974 | 
             
                run._set_library(_wl)
         | 
| 965 975 | 
             
                run._set_backend(backend)
         | 
| 966 976 | 
             
                backend._hack_set_run(run)
         |