wandb 0.20.1__py3-none-any.whl → 0.20.2rc20250616__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +41 -67
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/lib/printer.py
CHANGED
@@ -349,13 +349,13 @@ class _PrinterTerm(Printer):
|
|
349
349
|
text = text or " " * 79
|
350
350
|
wandb.termlog(text)
|
351
351
|
|
352
|
-
@override
|
353
352
|
@property
|
353
|
+
@override
|
354
354
|
def supports_html(self) -> bool:
|
355
355
|
return False
|
356
356
|
|
357
|
-
@override
|
358
357
|
@property
|
358
|
+
@override
|
359
359
|
def supports_unicode(self) -> bool:
|
360
360
|
return wandb.util.is_unicode_safe(sys.stderr)
|
361
361
|
|
@@ -464,11 +464,10 @@ class _PrinterJupyter(Printer):
|
|
464
464
|
|
465
465
|
if handle:
|
466
466
|
yield _DynamicJupyterText(handle)
|
467
|
+
handle.update(self._ipython_display.HTML(""))
|
467
468
|
else:
|
468
469
|
yield None
|
469
470
|
|
470
|
-
handle.update(self._ipython_display.HTML(""))
|
471
|
-
|
472
471
|
@override
|
473
472
|
def display(
|
474
473
|
self,
|
@@ -483,13 +482,13 @@ class _PrinterJupyter(Printer):
|
|
483
482
|
text = "<br>".join(text.splitlines())
|
484
483
|
self._ipython_display.display(self._ipython_display.HTML(text))
|
485
484
|
|
486
|
-
@override
|
487
485
|
@property
|
486
|
+
@override
|
488
487
|
def supports_html(self) -> bool:
|
489
488
|
return True
|
490
489
|
|
491
|
-
@override
|
492
490
|
@property
|
491
|
+
@override
|
493
492
|
def supports_unicode(self) -> bool:
|
494
493
|
return True
|
495
494
|
|
@@ -540,7 +539,7 @@ class _PrinterJupyter(Printer):
|
|
540
539
|
self._progress.update(percent_done, text)
|
541
540
|
|
542
541
|
@override
|
543
|
-
def progress_close(self,
|
542
|
+
def progress_close(self, text: str | None = None) -> None:
|
544
543
|
if self._progress:
|
545
544
|
self._progress.close()
|
546
545
|
|
wandb/sdk/lib/progress.py
CHANGED
@@ -7,7 +7,6 @@ import contextlib
|
|
7
7
|
import time
|
8
8
|
from typing import Iterable, Iterator, NoReturn
|
9
9
|
|
10
|
-
from wandb import env
|
11
10
|
from wandb.proto import wandb_internal_pb2 as pb
|
12
11
|
from wandb.sdk.interface import interface
|
13
12
|
from wandb.sdk.lib import asyncio_compat
|
@@ -107,8 +106,7 @@ class ProgressPrinter:
|
|
107
106
|
progress_text_area: p.DynamicText | None,
|
108
107
|
default_text: str,
|
109
108
|
) -> None:
|
110
|
-
|
111
|
-
self._show_operation_stats = not env.is_require_legacy_service()
|
109
|
+
self._show_operation_stats = True
|
112
110
|
self._printer = printer
|
113
111
|
self._progress_text_area = progress_text_area
|
114
112
|
self._default_text = default_text
|
@@ -0,0 +1,13 @@
|
|
1
|
+
"""Constants determining what IPC methods are supported."""
|
2
|
+
|
3
|
+
import socket
|
4
|
+
|
5
|
+
SUPPORTS_UNIX = hasattr(socket, "AF_UNIX")
|
6
|
+
"""Whether Unix sockets are supported.
|
7
|
+
|
8
|
+
AF_UNIX is not supported on Windows:
|
9
|
+
https://github.com/python/cpython/issues/77589
|
10
|
+
|
11
|
+
Windows has supported Unix sockets since ~2017, but support in Python is
|
12
|
+
missing as of 2025.
|
13
|
+
"""
|
@@ -1,7 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import atexit
|
4
|
-
import os
|
5
4
|
from typing import Callable
|
6
5
|
|
7
6
|
from wandb.proto import wandb_internal_pb2 as pb
|
@@ -11,77 +10,42 @@ from wandb.sdk import wandb_settings
|
|
11
10
|
from wandb.sdk.interface.interface import InterfaceBase
|
12
11
|
from wandb.sdk.interface.interface_sock import InterfaceSock
|
13
12
|
from wandb.sdk.interface.router_sock import MessageSockRouter
|
14
|
-
from wandb.sdk.lib import service_token
|
15
13
|
from wandb.sdk.lib.exit_hooks import ExitHooks
|
16
14
|
from wandb.sdk.lib.sock_client import SockClient, SockClientClosedError
|
17
15
|
from wandb.sdk.mailbox import HandleAbandonedError, Mailbox, MailboxClosedError
|
18
|
-
from wandb.sdk.service import service
|
19
16
|
|
20
|
-
|
21
|
-
class WandbServiceConnectionError(Exception):
|
22
|
-
"""Raised on failure to connect to the service process."""
|
17
|
+
from . import service_process, service_token
|
23
18
|
|
24
19
|
|
25
20
|
class WandbAttachFailedError(Exception):
|
26
|
-
"""
|
21
|
+
"""Failed to attach to a run."""
|
27
22
|
|
28
23
|
|
29
24
|
def connect_to_service(
|
30
25
|
settings: wandb_settings.Settings,
|
31
26
|
) -> ServiceConnection:
|
32
|
-
"""
|
33
|
-
|
34
|
-
if conn:
|
35
|
-
return conn
|
36
|
-
|
37
|
-
return _start_and_connect_service(settings)
|
38
|
-
|
39
|
-
|
40
|
-
def _try_connect_to_existing_service() -> ServiceConnection | None:
|
41
|
-
"""Attempts to connect to an existing service process."""
|
42
|
-
token = service_token.get_service_token()
|
43
|
-
if not token:
|
44
|
-
return None
|
45
|
-
|
46
|
-
# Only localhost sockets are supported below.
|
47
|
-
assert token.host == "localhost"
|
48
|
-
client = SockClient()
|
27
|
+
"""Connect to the service process, starting one up if necessary."""
|
28
|
+
token = service_token.from_env()
|
49
29
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
except Exception as e:
|
55
|
-
raise WandbServiceConnectionError(
|
56
|
-
"Failed to connect to internal service."
|
57
|
-
) from e
|
58
|
-
|
59
|
-
return ServiceConnection(client=client, proc=None)
|
30
|
+
if token:
|
31
|
+
return ServiceConnection(client=token.connect(), proc=None)
|
32
|
+
else:
|
33
|
+
return _start_and_connect_service(settings)
|
60
34
|
|
61
35
|
|
62
36
|
def _start_and_connect_service(
|
63
37
|
settings: wandb_settings.Settings,
|
64
38
|
) -> ServiceConnection:
|
65
|
-
"""
|
39
|
+
"""Start a service process and return a connection to it.
|
66
40
|
|
67
41
|
An atexit hook is registered to tear down the service process and wait for
|
68
42
|
it to complete. The hook does not run in processes started using the
|
69
43
|
multiprocessing module.
|
70
44
|
"""
|
71
|
-
proc =
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
assert port
|
76
|
-
client = SockClient()
|
77
|
-
client.connect(port)
|
78
|
-
|
79
|
-
service_token.set_service_token(
|
80
|
-
parent_pid=os.getpid(),
|
81
|
-
transport="tcp",
|
82
|
-
host="localhost",
|
83
|
-
port=port,
|
84
|
-
)
|
45
|
+
proc = service_process.start(settings)
|
46
|
+
|
47
|
+
client = proc.token.connect()
|
48
|
+
proc.token.save_to_env()
|
85
49
|
|
86
50
|
hooks = ExitHooks()
|
87
51
|
hooks.hook()
|
@@ -106,7 +70,7 @@ class ServiceConnection:
|
|
106
70
|
def __init__(
|
107
71
|
self,
|
108
72
|
client: SockClient,
|
109
|
-
proc:
|
73
|
+
proc: service_process.ServiceProcess | None,
|
110
74
|
cleanup: Callable[[], None] | None = None,
|
111
75
|
):
|
112
76
|
"""Returns a new ServiceConnection.
|
@@ -132,7 +96,7 @@ class ServiceConnection:
|
|
132
96
|
return InterfaceSock(self._client, self._mailbox, stream_id=stream_id)
|
133
97
|
|
134
98
|
def send_record(self, record: pb.Record) -> None:
|
135
|
-
"""
|
99
|
+
"""Send data to the service."""
|
136
100
|
self._client.send_record_publish(record)
|
137
101
|
|
138
102
|
def inform_init(
|
@@ -140,14 +104,14 @@ class ServiceConnection:
|
|
140
104
|
settings: wandb_settings_pb2.Settings,
|
141
105
|
run_id: str,
|
142
106
|
) -> None:
|
143
|
-
"""
|
107
|
+
"""Send an init request to the service."""
|
144
108
|
request = spb.ServerInformInitRequest()
|
145
109
|
request.settings.CopyFrom(settings)
|
146
110
|
request._info.stream_id = run_id
|
147
111
|
self._client.send_server_request(spb.ServerRequest(inform_init=request))
|
148
112
|
|
149
113
|
def inform_finish(self, run_id: str) -> None:
|
150
|
-
"""
|
114
|
+
"""Send a finish request to the service."""
|
151
115
|
request = spb.ServerInformFinishRequest()
|
152
116
|
request._info.stream_id = run_id
|
153
117
|
self._client.send_server_request(spb.ServerRequest(inform_finish=request))
|
@@ -156,7 +120,7 @@ class ServiceConnection:
|
|
156
120
|
self,
|
157
121
|
attach_id: str,
|
158
122
|
) -> wandb_settings_pb2.Settings:
|
159
|
-
"""
|
123
|
+
"""Send an attach request to the service.
|
160
124
|
|
161
125
|
Raises a WandbAttachFailedError if attaching is not possible.
|
162
126
|
"""
|
@@ -188,7 +152,7 @@ class ServiceConnection:
|
|
188
152
|
settings: wandb_settings_pb2.Settings,
|
189
153
|
run_id: str,
|
190
154
|
) -> None:
|
191
|
-
"""
|
155
|
+
"""Send a start request to the service."""
|
192
156
|
request = spb.ServerInformStartRequest()
|
193
157
|
request.settings.CopyFrom(settings)
|
194
158
|
request._info.stream_id = run_id
|
@@ -221,7 +185,7 @@ class ServiceConnection:
|
|
221
185
|
return None
|
222
186
|
|
223
187
|
# Clear the service token to prevent new connections to the process.
|
224
|
-
service_token.
|
188
|
+
service_token.clear_service_in_env()
|
225
189
|
|
226
190
|
self._client.send_server_request(
|
227
191
|
spb.ServerRequest(
|
@@ -0,0 +1,105 @@
|
|
1
|
+
"""Module for figuring out how to connect to the service process."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import os
|
6
|
+
import pathlib
|
7
|
+
import re
|
8
|
+
import subprocess
|
9
|
+
import time
|
10
|
+
|
11
|
+
import wandb
|
12
|
+
|
13
|
+
from . import ipc_support, service_token
|
14
|
+
|
15
|
+
# Time functions are monkeypatched in unit tests.
|
16
|
+
_MONOTONIC = time.monotonic
|
17
|
+
_SLEEP = time.sleep
|
18
|
+
|
19
|
+
|
20
|
+
class ServicePollForTokenError(wandb.Error):
|
21
|
+
"""Failed to discover how to connect to the service."""
|
22
|
+
|
23
|
+
|
24
|
+
def poll_for_token(
    file: pathlib.Path,
    proc: subprocess.Popen,
    *,
    timeout: float,
) -> service_token.ServiceToken:
    """Wait until the 'port' file tells us how to reach the service.

    The file is re-read roughly every 0.2 seconds until it yields a token,
    the service process exits, or the timeout elapses.

    Args:
        file: Path of the file the service is expected to write.
        proc: The process expected to write the file. Its exit status is
            checked before every read attempt.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        A token describing how to connect to the service process.

    Raises:
        ServicePollForTokenError: if the process has exited, the timeout
            is reached, or the port file names no known connection method.
    """
    deadline = _MONOTONIC() + timeout

    while _MONOTONIC() < deadline:
        exit_code = proc.poll()
        if exit_code is not None:
            # The process is gone, so the file will never be completed.
            # Attach whatever output was captured to help with debugging.
            failure_context = {
                "command": proc.args,
                "proc_out": proc.stdout.read() if proc.stdout else "",
                "proc_err": proc.stderr.read() if proc.stderr else "",
            }
            raise ServicePollForTokenError(
                f"wandb-core exited with code {exit_code}",
                context=failure_context,
            )

        token = _poll_once(file)
        if token:
            return token

        # Never sleep past the deadline, and never pass a negative duration.
        remaining = deadline - _MONOTONIC()
        _SLEEP(max(0, min(0.2, remaining)))

    raise ServicePollForTokenError(
        f"Failed to read port info after {timeout} seconds.",
    )
|
66
|
+
|
67
|
+
|
68
|
+
_UNIX_NAME_RE = re.compile(r"unix=(.+)")
|
69
|
+
_TCP_PORT_RE = re.compile(r"sock=(\d+)")
|
70
|
+
|
71
|
+
|
72
|
+
def _poll_once(file: pathlib.Path) -> service_token.ServiceToken | None:
    """Try to read the port file.

    The file is considered complete only once its last line is "EOF".
    Lines are then scanned in order and the first recognized connection
    method wins; a "unix=" line is honored only when Unix sockets are
    supported by this Python build.

    Args:
        file: Path of the port file to read.

    Returns:
        A connection token on success. Returns None if the file cannot be
        read yet, is still empty, or does not yet end with "EOF".

    Raises:
        ServicePollForTokenError: if the file is complete but contains no
            known connection method.
    """
    try:
        text = file.read_text()
    except OSError:
        return None

    lines = text.splitlines()

    # The file may exist but still be empty while the service is writing it;
    # treat that like any other incomplete file instead of indexing into an
    # empty list.
    if not lines or lines[-1] != "EOF":
        return None

    for line in lines:
        if ipc_support.SUPPORTS_UNIX and (match := _UNIX_NAME_RE.fullmatch(line)):
            return service_token.UnixServiceToken(
                parent_pid=os.getpid(),
                path=match.group(1),
            )
        elif match := _TCP_PORT_RE.fullmatch(line):
            return service_token.TCPServiceToken(
                parent_pid=os.getpid(),
                port=int(match.group(1)),
            )

    raise ServicePollForTokenError(
        f"No known connection method in {file}:\n{text}",
    )
|
@@ -0,0 +1,111 @@
|
|
1
|
+
"""Module for starting up the service process (wandb-core)."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import os
|
6
|
+
import pathlib
|
7
|
+
import platform
|
8
|
+
import subprocess
|
9
|
+
import tempfile
|
10
|
+
from typing import TYPE_CHECKING
|
11
|
+
|
12
|
+
from wandb import _sentry
|
13
|
+
from wandb.env import core_debug, dcgm_profiling_enabled, error_reporting_enabled
|
14
|
+
from wandb.errors import WandbCoreNotAvailableError
|
15
|
+
from wandb.sdk.lib.service import ipc_support
|
16
|
+
from wandb.util import get_core_path
|
17
|
+
|
18
|
+
from . import service_port_file, service_token
|
19
|
+
|
20
|
+
if TYPE_CHECKING:
|
21
|
+
from wandb.sdk.wandb_settings import Settings
|
22
|
+
|
23
|
+
|
24
|
+
def start(settings: Settings) -> ServiceProcess:
    """Spawn the internal service process.

    The Sentry scope is configured with the settings as tags before the
    launch is attempted. Any exception raised while launching is handed to
    `_sentry.reraise`.

    Args:
        settings: The settings used to configure and launch the service.

    Returns:
        A handle to the process.
    """
    _sentry.configure_scope(tags=dict(settings), process_context="service")

    try:
        service = _launch_server(settings)
    except Exception as e:
        _sentry.reraise(e)
    else:
        return service
|
36
|
+
|
37
|
+
|
38
|
+
class ServiceProcess:
    """Handle for a process that runs the internal service."""

    def __init__(
        self,
        *,
        connection_token: service_token.ServiceToken,
        process: subprocess.Popen,
    ) -> None:
        # Both values are stored exactly as given.
        self._process = process
        self._token = connection_token

    def join(self) -> int:
        """Block until the process exits, then return its exit code."""
        exit_code = self._process.wait()
        return exit_code

    @property
    def token(self) -> service_token.ServiceToken:
        """The token to use for connecting to the process."""
        return self._token
|
58
|
+
|
59
|
+
|
60
|
+
def _launch_server(settings: Settings) -> ServiceProcess:
    """Spawn wandb-core and wait for it to report how to connect.

    The service is asked to write its connection info to a port file inside
    a temporary directory; that file is polled for up to
    `settings.x_service_wait` seconds.

    Args:
        settings: Supplies the polling timeout.

    Returns:
        A handle wrapping the spawned process and its connection token.
    """
    on_windows = platform.system() == "Windows"
    creationflags: int = (
        subprocess.CREATE_NEW_PROCESS_GROUP  # type: ignore[attr-defined]
        if on_windows
        else 0
    )
    start_new_session = not on_windows

    pid = str(os.getpid())

    with tempfile.TemporaryDirectory() as tmpdir:
        port_file = pathlib.Path(tmpdir, f"port-{pid}.txt")

        try:
            core_path = get_core_path()
        except WandbCoreNotAvailableError as e:
            _sentry.reraise(e)

        # The executable comes first, followed by optional flags.
        service_args: list[str] = [core_path]

        if not error_reporting_enabled():
            service_args.append("--no-observability")

        if core_debug(default="False"):
            service_args += ["--log-level", "-4"]

        if dcgm_profiling_enabled():
            service_args.append("--enable-dcgm-profiling")

        service_args += ["--port-filename", str(port_file)]
        service_args += ["--pid", pid]

        # Without Unix socket support, ask the service to listen on localhost.
        if not ipc_support.SUPPORTS_UNIX:
            service_args.append("--listen-on-localhost")

        proc = subprocess.Popen(
            service_args,
            env=os.environ,
            close_fds=True,
            creationflags=creationflags,
            start_new_session=start_new_session,
        )

        token = service_port_file.poll_for_token(
            port_file,
            proc,
            timeout=settings.x_service_wait,
        )

        return ServiceProcess(connection_token=token, process=proc)
|
@@ -0,0 +1,164 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import abc
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import socket
|
7
|
+
|
8
|
+
from typing_extensions import final, override
|
9
|
+
|
10
|
+
from wandb import env
|
11
|
+
from wandb.sdk.lib.service import ipc_support
|
12
|
+
from wandb.sdk.lib.sock_client import SockClient
|
13
|
+
|
14
|
+
_CURRENT_VERSION = "3"
|
15
|
+
|
16
|
+
# Token formats:
|
17
|
+
_UNIX_TOKEN_RE = re.compile(rf"{_CURRENT_VERSION}-(\d+)-unix-(.+)")
|
18
|
+
_TCP_TOKEN_RE = re.compile(rf"{_CURRENT_VERSION}-(\d+)-tcp-localhost-(\d+)")
|
19
|
+
|
20
|
+
|
21
|
+
class WandbServiceConnectionError(Exception):
|
22
|
+
"""Failed to connect to the service process."""
|
23
|
+
|
24
|
+
|
25
|
+
def clear_service_in_env() -> None:
    """Remove the service token from this process's environment variables."""
    variable = env.SERVICE
    # A default is supplied so that clearing an unset variable is a no-op.
    os.environ.pop(variable, None)
|
28
|
+
|
29
|
+
|
30
|
+
def from_env() -> ServiceToken | None:
    """Load the service token from this process's environment variables.

    The Unix token format is tried first, then the TCP format.

    Returns:
        The parsed token, or None if the environment variable is unset
        or empty.

    Raises:
        ValueError: If the environment variable is set but matches no
            known token format.
    """
    raw = os.environ.get(env.SERVICE)
    if not raw:
        return None

    for token_type in (UnixServiceToken, TCPServiceToken):
        parsed = token_type.from_env_string(raw)
        if parsed:
            return parsed

    raise ValueError(f"Failed to parse {env.SERVICE}={raw!r}")
|
50
|
+
|
51
|
+
|
52
|
+
class ServiceToken(abc.ABC):
    """Describes how to reach a running service process.

    Subclasses supply a concrete transport by implementing `connect` and
    `_as_env_string`.
    """

    @abc.abstractmethod
    def connect(self) -> SockClient:
        """Open a connection to the service process.

        Returns:
            A socket object for communicating with the service.

        Raises:
            WandbServiceConnectionError: on failure to connect.
        """

    @abc.abstractmethod
    def _as_env_string(self) -> str:
        """Return this token serialized as a string."""

    def save_to_env(self) -> None:
        """Store the token in this process's environment variables."""
        serialized = self._as_env_string()
        os.environ[env.SERVICE] = serialized
|
73
|
+
|
74
|
+
|
75
|
+
@final
class UnixServiceToken(ServiceToken):
    """Connects to the service using a Unix domain socket."""

    def __init__(self, *, parent_pid: int, path: str) -> None:
        self._parent_pid = parent_pid
        self._path = path

    @override
    def connect(self) -> SockClient:
        """Connect to the service over the Unix socket at the stored path.

        Returns:
            A socket client wrapping the connected socket.

        Raises:
            WandbServiceConnectionError: if Unix sockets are unsupported
                or the connection attempt fails.
        """
        if not ipc_support.SUPPORTS_UNIX:
            raise WandbServiceConnectionError("AF_UNIX socket not supported")

        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)

        try:
            # TODO: This may block indefinitely if the service is unhealthy.
            sock.connect(self._path)
        except Exception as e:
            # Release the file descriptor; nobody else holds a reference
            # to this socket once the error propagates.
            sock.close()
            raise WandbServiceConnectionError(
                f"Failed to connect to service on socket {self._path}",
            ) from e

        return SockClient(sock)

    @override
    def _as_env_string(self) -> str:
        """Return the token as "<version>-<parent pid>-unix-<path>"."""
        return "-".join(
            (
                _CURRENT_VERSION,
                str(self._parent_pid),
                "unix",
                str(self._path),
            )
        )

    @staticmethod
    def from_env_string(token: str) -> UnixServiceToken | None:
        """Returns a Unix service token parsed from the env var.

        Returns None if the string is not in the Unix token format.
        """
        match = _UNIX_TOKEN_RE.fullmatch(token)
        if not match:
            return None

        parent_pid, path = match.groups()
        return UnixServiceToken(parent_pid=int(parent_pid), path=path)
|
120
|
+
|
121
|
+
|
122
|
+
@final
class TCPServiceToken(ServiceToken):
    """Connects to the service using TCP over a localhost socket."""

    def __init__(self, *, parent_pid: int, port: int) -> None:
        self._parent_pid = parent_pid
        self._port = port

    @override
    def connect(self) -> SockClient:
        """Connect to the service on localhost at the stored port.

        Returns:
            A socket client wrapping the connected socket.

        Raises:
            WandbServiceConnectionError: if the connection attempt fails.
        """
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        try:
            # TODO: This may block indefinitely if the service is unhealthy.
            sock.connect(("localhost", self._port))
        except Exception as e:
            # Release the file descriptor; nobody else holds a reference
            # to this socket once the error propagates.
            sock.close()
            raise WandbServiceConnectionError(
                f"Failed to connect to service on port {self._port}",
            ) from e

        return SockClient(sock)

    @override
    def _as_env_string(self) -> str:
        """Return the token as "<version>-<parent pid>-tcp-localhost-<port>"."""
        return "-".join(
            (
                _CURRENT_VERSION,
                str(self._parent_pid),
                "tcp",
                "localhost",
                str(self._port),
            )
        )

    @staticmethod
    def from_env_string(token: str) -> TCPServiceToken | None:
        """Returns a TCP service token parsed from the env var.

        Returns None if the string is not in the TCP token format.
        """
        match = _TCP_TOKEN_RE.fullmatch(token)
        if not match:
            return None

        parent_pid, port = match.groups()
        return TCPServiceToken(parent_pid=int(parent_pid), port=int(port))
|
wandb/sdk/lib/sock_client.py
CHANGED
@@ -79,17 +79,17 @@ class SockBuffer:
|
|
79
79
|
|
80
80
|
|
81
81
|
class SockClient:
|
82
|
-
_sock: socket.socket
|
83
|
-
_sockid: str
|
84
|
-
_retry_delay: float
|
85
|
-
_lock: "threading.Lock"
|
86
|
-
_bufsize: int
|
87
|
-
_buffer: SockBuffer
|
88
|
-
|
89
82
|
# current header is magic byte "W" followed by 4 byte length of the message
|
90
83
|
HEADLEN = 1 + 4
|
91
84
|
|
92
|
-
def __init__(self) -> None:
|
85
|
+
def __init__(self, sock: socket.socket) -> None:
|
86
|
+
"""Create a SockClient.
|
87
|
+
|
88
|
+
Args:
|
89
|
+
sock: A connected socket.
|
90
|
+
"""
|
91
|
+
self._sock = sock
|
92
|
+
|
93
93
|
# TODO: use safe uuid's (python3.7+) or emulate this
|
94
94
|
self._sockid = uuid.uuid4().hex
|
95
95
|
self._retry_delay = 0.1
|
@@ -97,10 +97,6 @@ class SockClient:
|
|
97
97
|
self._bufsize = 4096
|
98
98
|
self._buffer = SockBuffer()
|
99
99
|
|
100
|
-
def connect(self, port: int) -> None:
|
101
|
-
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
102
|
-
s.connect(("localhost", port))
|
103
|
-
self._sock = s
|
104
100
|
self._detect_bufsize()
|
105
101
|
|
106
102
|
def _detect_bufsize(self) -> None:
|
wandb/sdk/wandb_init.py
CHANGED
@@ -925,8 +925,6 @@ class _WandbInit:
|
|
925
925
|
tel.feature.flow_control_disabled = True
|
926
926
|
if settings.x_flow_control_custom:
|
927
927
|
tel.feature.flow_control_custom = True
|
928
|
-
if not settings.x_require_legacy_service:
|
929
|
-
tel.feature.core = True
|
930
928
|
if settings._shared:
|
931
929
|
wandb.termwarn(
|
932
930
|
"The `shared` mode feature is experimental and may change. "
|
@@ -1621,4 +1619,3 @@ def init( # noqa: C901
|
|
1621
1619
|
# Need to build delay into this sentry capture because our exit hooks
|
1622
1620
|
# mess with sentry's ability to send out errors before the program ends.
|
1623
1621
|
wandb._sentry.reraise(e)
|
1624
|
-
raise AssertionError() # should never get here
|