wandb 0.18.1__py3-none-win_amd64.whl → 0.18.3__py3-none-win_amd64.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +3 -3
- wandb/__init__.pyi +67 -12
- wandb/apis/internal.py +3 -0
- wandb/apis/public/api.py +128 -2
- wandb/apis/public/artifacts.py +11 -7
- wandb/apis/public/jobs.py +8 -0
- wandb/apis/public/runs.py +16 -5
- wandb/bin/nvidia_gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +0 -3
- wandb/errors/__init__.py +11 -40
- wandb/errors/errors.py +37 -0
- wandb/errors/warnings.py +2 -0
- wandb/integration/tensorboard/log.py +1 -1
- wandb/old/core.py +2 -80
- wandb/plot/bar.py +7 -4
- wandb/plot/confusion_matrix.py +5 -4
- wandb/plot/histogram.py +7 -4
- wandb/plot/line.py +7 -4
- wandb/proto/v3/wandb_internal_pb2.py +31 -21
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_internal_pb2.py +23 -21
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_internal_pb2.py +23 -21
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/_validators.py +48 -3
- wandb/sdk/artifacts/artifact.py +160 -186
- wandb/sdk/artifacts/artifact_file_cache.py +13 -11
- wandb/sdk/artifacts/artifact_instance_cache.py +4 -2
- wandb/sdk/artifacts/artifact_manifest.py +13 -11
- wandb/sdk/artifacts/artifact_manifest_entry.py +24 -22
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +9 -7
- wandb/sdk/artifacts/artifact_saver.py +27 -25
- wandb/sdk/artifacts/exceptions.py +26 -25
- wandb/sdk/artifacts/storage_handler.py +11 -9
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +16 -14
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +15 -13
- wandb/sdk/artifacts/storage_handlers/http_handler.py +15 -14
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +14 -12
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +19 -19
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +12 -10
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +9 -7
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +31 -29
- wandb/sdk/artifacts/storage_policy.py +20 -20
- wandb/sdk/backend/backend.py +8 -26
- wandb/sdk/data_types/base_types/wb_value.py +1 -3
- wandb/sdk/data_types/video.py +2 -2
- wandb/sdk/interface/interface.py +0 -24
- wandb/sdk/interface/interface_shared.py +0 -12
- wandb/sdk/internal/handler.py +0 -10
- wandb/sdk/internal/internal_api.py +71 -0
- wandb/sdk/internal/sender.py +0 -43
- wandb/sdk/internal/tb_watcher.py +1 -1
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/hashutil.py +34 -12
- wandb/sdk/lib/service_connection.py +216 -0
- wandb/sdk/lib/service_token.py +94 -0
- wandb/sdk/lib/sock_client.py +7 -3
- wandb/sdk/service/server.py +2 -5
- wandb/sdk/service/service.py +0 -22
- wandb/sdk/wandb_init.py +33 -22
- wandb/sdk/wandb_run.py +45 -33
- wandb/sdk/wandb_settings.py +2 -0
- wandb/sdk/wandb_setup.py +25 -16
- wandb/sdk/wandb_sync.py +9 -3
- wandb/sdk/wandb_watch.py +31 -15
- wandb/util.py +8 -1
- {wandb-0.18.1.dist-info → wandb-0.18.3.dist-info}/METADATA +3 -2
- {wandb-0.18.1.dist-info → wandb-0.18.3.dist-info}/RECORD +75 -74
- wandb/sdk/internal/update.py +0 -113
- wandb/sdk/service/service_base.py +0 -50
- wandb/sdk/service/service_sock.py +0 -70
- wandb/sdk/wandb_manager.py +0 -232
- /wandb/{sdk/lib → plot}/viz.py +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.3.dist-info}/WHEEL +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.3.dist-info}/entry_points.txt +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.3.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/lib/hashutil.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import base64
|
2
4
|
import hashlib
|
3
5
|
import mmap
|
4
|
-
import os
|
5
6
|
import sys
|
6
|
-
from
|
7
|
-
from typing import NewType, Union
|
7
|
+
from typing import TYPE_CHECKING, NewType
|
8
8
|
|
9
9
|
from wandb.sdk.lib.paths import StrPath
|
10
10
|
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
import _hashlib # type: ignore[import-not-found]
|
13
|
+
|
11
14
|
ETag = NewType("ETag", str)
|
12
15
|
HexMD5 = NewType("HexMD5", str)
|
13
16
|
B64MD5 = NewType("B64MD5", str)
|
14
17
|
|
15
18
|
|
16
|
-
def _md5(data: bytes = b"") ->
|
19
|
+
def _md5(data: bytes = b"") -> _hashlib.HASH:
|
17
20
|
"""Allow FIPS-compliant md5 hash when supported."""
|
18
21
|
if sys.version_info >= (3, 9):
|
19
22
|
return hashlib.md5(data, usedforsecurity=False)
|
@@ -25,7 +28,7 @@ def md5_string(string: str) -> B64MD5:
|
|
25
28
|
return _b64_from_hasher(_md5(string.encode("utf-8")))
|
26
29
|
|
27
30
|
|
28
|
-
def _b64_from_hasher(hasher:
|
31
|
+
def _b64_from_hasher(hasher: _hashlib.HASH) -> B64MD5:
|
29
32
|
return B64MD5(base64.b64encode(hasher.digest()).decode("ascii"))
|
30
33
|
|
31
34
|
|
@@ -33,7 +36,7 @@ def b64_to_hex_id(string: B64MD5) -> HexMD5:
|
|
33
36
|
return HexMD5(base64.standard_b64decode(string).hex())
|
34
37
|
|
35
38
|
|
36
|
-
def hex_to_b64_id(encoded_string:
|
39
|
+
def hex_to_b64_id(encoded_string: str | bytes) -> B64MD5:
|
37
40
|
if isinstance(encoded_string, bytes):
|
38
41
|
encoded_string = encoded_string.decode("utf-8")
|
39
42
|
as_str = bytes.fromhex(encoded_string)
|
@@ -48,15 +51,34 @@ def md5_file_hex(*paths: StrPath) -> HexMD5:
|
|
48
51
|
return HexMD5(_md5_file_hasher(*paths).hexdigest())
|
49
52
|
|
50
53
|
|
51
|
-
|
54
|
+
_KB: int = 1_024
|
55
|
+
_CHUNKSIZE: int = 128 * _KB
|
56
|
+
"""Chunk size (in bytes) for iteratively reading from file, if needed."""
|
57
|
+
|
58
|
+
|
59
|
+
def _md5_file_hasher(*paths: StrPath) -> _hashlib.HASH:
|
52
60
|
md5_hash = _md5()
|
53
61
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
else:
|
62
|
+
# Note: We use str paths (instead of pathlib.Path objs) for minor perf improvements.
|
63
|
+
for path in sorted(map(str, paths)):
|
64
|
+
with open(path, "rb") as f:
|
65
|
+
try:
|
59
66
|
with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mview:
|
60
67
|
md5_hash.update(mview)
|
68
|
+
except OSError:
|
69
|
+
# This occurs if the mmap-ed file is on a different/mounted filesystem,
|
70
|
+
# so we'll fall back on a less performant implementation.
|
71
|
+
|
72
|
+
# Note: At the time of implementation, the walrus operator `:=`
|
73
|
+
# is avoided to maintain support for users on python 3.7.
|
74
|
+
# Consider revisiting once 3.7 support is no longer needed.
|
75
|
+
chunk = f.read(_CHUNKSIZE)
|
76
|
+
while chunk:
|
77
|
+
md5_hash.update(chunk)
|
78
|
+
chunk = f.read(_CHUNKSIZE)
|
79
|
+
except ValueError:
|
80
|
+
# This occurs when mmap-ing an empty file, which can be skipped.
|
81
|
+
# See: https://github.com/python/cpython/blob/986a4e1b6fcae7fe7a1d0a26aea446107dd58dd2/Modules/mmapmodule.c#L1589
|
82
|
+
pass
|
61
83
|
|
62
84
|
return md5_hash
|
@@ -0,0 +1,216 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import atexit
|
4
|
+
import os
|
5
|
+
from typing import Callable
|
6
|
+
|
7
|
+
from wandb.proto import wandb_internal_pb2 as pb
|
8
|
+
from wandb.proto import wandb_server_pb2 as spb
|
9
|
+
from wandb.proto import wandb_settings_pb2
|
10
|
+
from wandb.sdk import wandb_settings
|
11
|
+
from wandb.sdk.interface.interface import InterfaceBase
|
12
|
+
from wandb.sdk.interface.interface_sock import InterfaceSock
|
13
|
+
from wandb.sdk.lib import service_token
|
14
|
+
from wandb.sdk.lib.exit_hooks import ExitHooks
|
15
|
+
from wandb.sdk.lib.mailbox import Mailbox
|
16
|
+
from wandb.sdk.lib.sock_client import SockClient, SockClientTimeoutError
|
17
|
+
from wandb.sdk.service import service
|
18
|
+
|
19
|
+
|
20
|
+
class WandbServiceNotOwnedError(Exception):
|
21
|
+
"""Raised when the current process does not own the service process."""
|
22
|
+
|
23
|
+
|
24
|
+
class WandbServiceConnectionError(Exception):
|
25
|
+
"""Raised on failure to connect to the service process."""
|
26
|
+
|
27
|
+
|
28
|
+
class WandbAttachFailedError(Exception):
|
29
|
+
"""Raised if attaching to a run fails."""
|
30
|
+
|
31
|
+
|
32
|
+
def connect_to_service(
|
33
|
+
settings: wandb_settings.Settings,
|
34
|
+
) -> ServiceConnection:
|
35
|
+
"""Connects to the service process, starting one up if necessary."""
|
36
|
+
conn = _try_connect_to_existing_service()
|
37
|
+
if conn:
|
38
|
+
return conn
|
39
|
+
|
40
|
+
return _start_and_connect_service(settings)
|
41
|
+
|
42
|
+
|
43
|
+
def _try_connect_to_existing_service() -> ServiceConnection | None:
|
44
|
+
"""Attemps to connect to an existing service process."""
|
45
|
+
token = service_token.get_service_token()
|
46
|
+
if not token:
|
47
|
+
return None
|
48
|
+
|
49
|
+
# Only localhost sockets are supported below.
|
50
|
+
assert token.host == "localhost"
|
51
|
+
client = SockClient()
|
52
|
+
|
53
|
+
try:
|
54
|
+
# TODO: This may block indefinitely if the service is unhealthy.
|
55
|
+
client.connect(token.port)
|
56
|
+
|
57
|
+
except Exception as e:
|
58
|
+
raise WandbServiceConnectionError(
|
59
|
+
"Failed to connect to internal service."
|
60
|
+
) from e
|
61
|
+
|
62
|
+
return ServiceConnection(client=client, proc=None)
|
63
|
+
|
64
|
+
|
65
|
+
def _start_and_connect_service(
|
66
|
+
settings: wandb_settings.Settings,
|
67
|
+
) -> ServiceConnection:
|
68
|
+
"""Starts a service process and returns a connection to it.
|
69
|
+
|
70
|
+
An atexit hook is registered to tear down the service process and wait for
|
71
|
+
it to complete. The hook does not run in processes started using the
|
72
|
+
multiprocessing module.
|
73
|
+
"""
|
74
|
+
proc = service._Service(settings)
|
75
|
+
proc.start()
|
76
|
+
|
77
|
+
port = proc.sock_port
|
78
|
+
assert port
|
79
|
+
client = SockClient()
|
80
|
+
client.connect(port)
|
81
|
+
|
82
|
+
service_token.set_service_token(
|
83
|
+
parent_pid=os.getpid(),
|
84
|
+
transport="tcp",
|
85
|
+
host="localhost",
|
86
|
+
port=port,
|
87
|
+
)
|
88
|
+
|
89
|
+
hooks = ExitHooks()
|
90
|
+
hooks.hook()
|
91
|
+
|
92
|
+
def teardown_atexit():
|
93
|
+
conn.teardown(hooks.exit_code)
|
94
|
+
|
95
|
+
conn = ServiceConnection(
|
96
|
+
client=client,
|
97
|
+
proc=proc,
|
98
|
+
cleanup=lambda: atexit.unregister(teardown_atexit),
|
99
|
+
)
|
100
|
+
|
101
|
+
atexit.register(teardown_atexit)
|
102
|
+
|
103
|
+
return conn
|
104
|
+
|
105
|
+
|
106
|
+
class ServiceConnection:
|
107
|
+
"""A connection to the W&B internal service process."""
|
108
|
+
|
109
|
+
def __init__(
|
110
|
+
self,
|
111
|
+
client: SockClient,
|
112
|
+
proc: service._Service | None,
|
113
|
+
cleanup: Callable[[], None] | None = None,
|
114
|
+
):
|
115
|
+
"""Returns a new ServiceConnection.
|
116
|
+
|
117
|
+
Args:
|
118
|
+
client: A socket that's connected to the service.
|
119
|
+
proc: The service process if we own it, or None otherwise.
|
120
|
+
cleanup: A callback to run on teardown before doing anything.
|
121
|
+
"""
|
122
|
+
self._client = client
|
123
|
+
self._proc = proc
|
124
|
+
self._torn_down = False
|
125
|
+
self._cleanup = cleanup
|
126
|
+
|
127
|
+
def make_interface(self, mailbox: Mailbox) -> InterfaceBase:
|
128
|
+
"""Returns an interface for communicating with the service."""
|
129
|
+
return InterfaceSock(self._client, mailbox)
|
130
|
+
|
131
|
+
def send_record(self, record: pb.Record) -> None:
|
132
|
+
"""Sends data to the service."""
|
133
|
+
self._client.send_record_publish(record)
|
134
|
+
|
135
|
+
def inform_init(
|
136
|
+
self,
|
137
|
+
settings: wandb_settings_pb2.Settings,
|
138
|
+
run_id: str,
|
139
|
+
) -> None:
|
140
|
+
"""Sends an init request to the service."""
|
141
|
+
request = spb.ServerInformInitRequest()
|
142
|
+
request.settings.CopyFrom(settings)
|
143
|
+
request._info.stream_id = run_id
|
144
|
+
self._client.send(inform_init=request)
|
145
|
+
|
146
|
+
def inform_finish(self, run_id: str) -> None:
|
147
|
+
"""Sends an finish request to the service."""
|
148
|
+
request = spb.ServerInformFinishRequest()
|
149
|
+
request._info.stream_id = run_id
|
150
|
+
self._client.send(inform_finish=request)
|
151
|
+
|
152
|
+
def inform_attach(
|
153
|
+
self,
|
154
|
+
attach_id: str,
|
155
|
+
) -> wandb_settings_pb2.Settings:
|
156
|
+
"""Sends an attach request to the service.
|
157
|
+
|
158
|
+
Raises a WandbAttachFailedError if attaching is not possible.
|
159
|
+
"""
|
160
|
+
request = spb.ServerInformAttachRequest()
|
161
|
+
request._info.stream_id = attach_id
|
162
|
+
|
163
|
+
try:
|
164
|
+
response = self._client.send_and_recv(inform_attach=request)
|
165
|
+
return response.inform_attach_response.settings
|
166
|
+
except SockClientTimeoutError:
|
167
|
+
raise WandbAttachFailedError(
|
168
|
+
"Could not attach because the run does not belong to"
|
169
|
+
" the current service process, or because the service"
|
170
|
+
" process is busy (unlikely)."
|
171
|
+
)
|
172
|
+
|
173
|
+
def inform_start(
|
174
|
+
self,
|
175
|
+
settings: wandb_settings_pb2.Settings,
|
176
|
+
run_id: str,
|
177
|
+
) -> None:
|
178
|
+
"""Sends a start request to the service."""
|
179
|
+
request = spb.ServerInformStartRequest()
|
180
|
+
request.settings.CopyFrom(settings)
|
181
|
+
request._info.stream_id = run_id
|
182
|
+
self._client.send(inform_start=request)
|
183
|
+
|
184
|
+
def teardown(self, exit_code: int) -> int:
|
185
|
+
"""Shuts down the service process and returns its exit code.
|
186
|
+
|
187
|
+
This may only be called once.
|
188
|
+
|
189
|
+
Returns:
|
190
|
+
The exit code of the service process.
|
191
|
+
|
192
|
+
Raises:
|
193
|
+
WandbServiceNotOwnedError: If the current process did not start
|
194
|
+
the service process.
|
195
|
+
"""
|
196
|
+
if not self._proc:
|
197
|
+
raise WandbServiceNotOwnedError(
|
198
|
+
"Cannot tear down service started by different process",
|
199
|
+
)
|
200
|
+
|
201
|
+
assert not self._torn_down
|
202
|
+
self._torn_down = True
|
203
|
+
|
204
|
+
if self._cleanup:
|
205
|
+
self._cleanup()
|
206
|
+
|
207
|
+
# Clear the service token to prevent new connections from being made.
|
208
|
+
service_token.clear_service_token()
|
209
|
+
|
210
|
+
self._client.send(
|
211
|
+
inform_teardown=spb.ServerInformTeardownRequest(
|
212
|
+
exit_code=exit_code,
|
213
|
+
)
|
214
|
+
)
|
215
|
+
|
216
|
+
return self._proc.join()
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import dataclasses
|
4
|
+
import os
|
5
|
+
|
6
|
+
from wandb import env
|
7
|
+
|
8
|
+
_CURRENT_VERSION = "2"
|
9
|
+
_SUPPORTED_TRANSPORTS = "tcp"
|
10
|
+
|
11
|
+
|
12
|
+
def get_service_token() -> ServiceToken | None:
|
13
|
+
"""Reads the token from environment variables.
|
14
|
+
|
15
|
+
Returns:
|
16
|
+
The token if the correct environment variable is set, or None.
|
17
|
+
|
18
|
+
Raises:
|
19
|
+
ValueError: If the environment variable is set but cannot be
|
20
|
+
parsed.
|
21
|
+
"""
|
22
|
+
token = os.environ.get(env.SERVICE)
|
23
|
+
if not token:
|
24
|
+
return None
|
25
|
+
|
26
|
+
parts = token.split("-")
|
27
|
+
if len(parts) != 5:
|
28
|
+
raise ValueError(f"Invalid token: {token}")
|
29
|
+
|
30
|
+
version, pid_str, transport, host, port_str = parts
|
31
|
+
|
32
|
+
if version != _CURRENT_VERSION:
|
33
|
+
raise ValueError(
|
34
|
+
f"Expected version {_CURRENT_VERSION},"
|
35
|
+
f" but got {version} (token={token})"
|
36
|
+
)
|
37
|
+
if transport not in _SUPPORTED_TRANSPORTS:
|
38
|
+
raise ValueError(
|
39
|
+
f"Unsupported transport: {transport} (token={token})",
|
40
|
+
)
|
41
|
+
|
42
|
+
try:
|
43
|
+
return ServiceToken(
|
44
|
+
version=version,
|
45
|
+
pid=int(pid_str),
|
46
|
+
transport=transport,
|
47
|
+
host=host,
|
48
|
+
port=int(port_str),
|
49
|
+
)
|
50
|
+
except ValueError as e:
|
51
|
+
raise ValueError(f"Invalid token: {token}") from e
|
52
|
+
|
53
|
+
|
54
|
+
def set_service_token(parent_pid: int, transport: str, host: str, port: int) -> None:
|
55
|
+
"""Stores a service token in an environment variable.
|
56
|
+
|
57
|
+
Args:
|
58
|
+
parent_pid: The process ID of the process that started the service.
|
59
|
+
transport: The transport used to communicate with the service.
|
60
|
+
host: The host part of the internet address on which the service
|
61
|
+
is listening (e.g. localhost).
|
62
|
+
port: The port the service is listening on.
|
63
|
+
|
64
|
+
Raises:
|
65
|
+
ValueError: If given an unsupported transport.
|
66
|
+
"""
|
67
|
+
if transport not in _SUPPORTED_TRANSPORTS:
|
68
|
+
raise ValueError(f"Unsupported transport: {transport}")
|
69
|
+
|
70
|
+
os.environ[env.SERVICE] = "-".join(
|
71
|
+
(
|
72
|
+
_CURRENT_VERSION,
|
73
|
+
str(parent_pid),
|
74
|
+
transport,
|
75
|
+
host,
|
76
|
+
str(port),
|
77
|
+
)
|
78
|
+
)
|
79
|
+
|
80
|
+
|
81
|
+
def clear_service_token() -> None:
|
82
|
+
"""Clears the environment variable storing the service token."""
|
83
|
+
os.environ.pop(env.SERVICE, None)
|
84
|
+
|
85
|
+
|
86
|
+
@dataclasses.dataclass(frozen=True)
|
87
|
+
class ServiceToken:
|
88
|
+
"""An identifier for a running service process."""
|
89
|
+
|
90
|
+
version: str
|
91
|
+
pid: int
|
92
|
+
transport: str
|
93
|
+
host: str
|
94
|
+
port: int
|
wandb/sdk/lib/sock_client.py
CHANGED
@@ -14,9 +14,11 @@ if TYPE_CHECKING:
|
|
14
14
|
|
15
15
|
|
16
16
|
class SockClientClosedError(Exception):
|
17
|
-
"""
|
17
|
+
"""Raised on operations on a closed socket."""
|
18
18
|
|
19
|
-
|
19
|
+
|
20
|
+
class SockClientTimeoutError(Exception):
|
21
|
+
"""Raised if the server didn't respond before the timeout."""
|
20
22
|
|
21
23
|
|
22
24
|
class SockBuffer:
|
@@ -182,8 +184,10 @@ class SockClient:
|
|
182
184
|
# it should be relatively stable.
|
183
185
|
# This pass would be solved as part of the fix in https://wandb.atlassian.net/browse/WB-8709
|
184
186
|
response = self.read_server_response(timeout=1)
|
187
|
+
|
185
188
|
if response is None:
|
186
|
-
raise
|
189
|
+
raise SockClientTimeoutError("No response after 1 second.")
|
190
|
+
|
187
191
|
return response
|
188
192
|
|
189
193
|
def send(
|
wandb/sdk/service/server.py
CHANGED
@@ -20,7 +20,6 @@ class WandbServer:
|
|
20
20
|
_pid: Optional[int]
|
21
21
|
_sock_port: Optional[int]
|
22
22
|
_debug: bool
|
23
|
-
_serve_sock: bool
|
24
23
|
_sock_server: Optional[SocketServer]
|
25
24
|
_startup_debug_enabled: bool
|
26
25
|
|
@@ -31,14 +30,12 @@ class WandbServer:
|
|
31
30
|
address: Optional[str] = None,
|
32
31
|
pid: Optional[int] = None,
|
33
32
|
debug: bool = True,
|
34
|
-
serve_sock: bool = False,
|
35
33
|
) -> None:
|
36
34
|
self._sock_port = sock_port
|
37
35
|
self._port_fname = port_fname
|
38
36
|
self._address = address
|
39
37
|
self._pid = pid
|
40
38
|
self._debug = debug
|
41
|
-
self._serve_sock = serve_sock
|
42
39
|
self._sock_server = None
|
43
40
|
self._startup_debug_enabled = _startup_debug.is_enabled()
|
44
41
|
|
@@ -97,7 +94,7 @@ class WandbServer:
|
|
97
94
|
pid = str(self._pid or 0)
|
98
95
|
transport = "s" if sock_port else "g"
|
99
96
|
port = sock_port or 0
|
100
|
-
# this format is similar to
|
97
|
+
# this format is similar to the service token, but it's purely informative now
|
101
98
|
# (consider unifying this in the future)
|
102
99
|
service_id = f"{service_ver}-{pid}-{transport}-{port}"
|
103
100
|
proc_title = f"wandb-service({service_id})"
|
@@ -109,7 +106,7 @@ class WandbServer:
|
|
109
106
|
self._setup_tracelog()
|
110
107
|
mux = StreamMux()
|
111
108
|
self._startup_debug_print("before_network")
|
112
|
-
sock_port = self._start_sock(mux=mux)
|
109
|
+
sock_port = self._start_sock(mux=mux)
|
113
110
|
self._startup_debug_print("after_network")
|
114
111
|
self._inform_used_ports(sock_port=sock_port)
|
115
112
|
self._startup_debug_print("after_inform")
|
wandb/sdk/service/service.py
CHANGED
@@ -21,8 +21,6 @@ from wandb.sdk.lib.wburls import wburls
|
|
21
21
|
from wandb.util import get_core_path, get_module
|
22
22
|
|
23
23
|
from . import _startup_debug, port_file
|
24
|
-
from .service_base import ServiceInterface
|
25
|
-
from .service_sock import ServiceSockInterface
|
26
24
|
|
27
25
|
if TYPE_CHECKING:
|
28
26
|
from wandb.sdk.wandb_settings import Settings
|
@@ -31,25 +29,18 @@ if TYPE_CHECKING:
|
|
31
29
|
class ServiceStartProcessError(Error):
|
32
30
|
"""Raised when a known error occurs when launching wandb service."""
|
33
31
|
|
34
|
-
pass
|
35
|
-
|
36
32
|
|
37
33
|
class ServiceStartTimeoutError(Error):
|
38
34
|
"""Raised when service start times out."""
|
39
35
|
|
40
|
-
pass
|
41
|
-
|
42
36
|
|
43
37
|
class ServiceStartPortError(Error):
|
44
38
|
"""Raised when service start fails to find a port."""
|
45
39
|
|
46
|
-
pass
|
47
|
-
|
48
40
|
|
49
41
|
class _Service:
|
50
42
|
_settings: "Settings"
|
51
43
|
_sock_port: Optional[int]
|
52
|
-
_service_interface: ServiceInterface
|
53
44
|
_internal_proc: Optional[subprocess.Popen]
|
54
45
|
_startup_debug_enabled: bool
|
55
46
|
|
@@ -65,10 +56,6 @@ class _Service:
|
|
65
56
|
|
66
57
|
_sentry.configure_scope(tags=dict(settings), process_context="service")
|
67
58
|
|
68
|
-
# current code only supports socket server implementation, in the
|
69
|
-
# future we might be able to support both
|
70
|
-
self._service_interface = ServiceSockInterface()
|
71
|
-
|
72
59
|
def _startup_debug_print(self, message: str) -> None:
|
73
60
|
if not self._startup_debug_enabled:
|
74
61
|
return
|
@@ -175,10 +162,6 @@ class _Service:
|
|
175
162
|
if core_debug(default="False"):
|
176
163
|
service_args.append("--debug")
|
177
164
|
|
178
|
-
trace_filename = os.environ.get("_WANDB_TRACE")
|
179
|
-
if trace_filename is not None:
|
180
|
-
service_args.extend(["--trace", trace_filename])
|
181
|
-
|
182
165
|
exec_cmd_list = []
|
183
166
|
termlog(
|
184
167
|
"Using wandb-core as the SDK backend."
|
@@ -194,7 +177,6 @@ class _Service:
|
|
194
177
|
"--pid",
|
195
178
|
pid,
|
196
179
|
]
|
197
|
-
service_args.append("--serve-sock")
|
198
180
|
|
199
181
|
if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
|
200
182
|
_ = get_module(
|
@@ -253,10 +235,6 @@ class _Service:
|
|
253
235
|
def sock_port(self) -> Optional[int]:
|
254
236
|
return self._sock_port
|
255
237
|
|
256
|
-
@property
|
257
|
-
def service_interface(self) -> ServiceInterface:
|
258
|
-
return self._service_interface
|
259
|
-
|
260
238
|
def join(self) -> int:
|
261
239
|
ret = 0
|
262
240
|
if self._internal_proc:
|
wandb/sdk/wandb_init.py
CHANGED
@@ -578,6 +578,7 @@ class _WandbInit:
|
|
578
578
|
):
|
579
579
|
setattr(drun, symbol, lambda *_, **__: None) # type: ignore
|
580
580
|
# attributes
|
581
|
+
drun._backend = None
|
581
582
|
drun._step = 0
|
582
583
|
drun._attach_id = None
|
583
584
|
drun._run_obj = None
|
@@ -655,9 +656,9 @@ class _WandbInit:
|
|
655
656
|
f"Successfully finished last run (ID:{latest_run._run_id}). Initializing new run:<br/>"
|
656
657
|
)
|
657
658
|
elif isinstance(wandb.run, Run):
|
658
|
-
|
659
|
+
service = self._wl.service
|
659
660
|
# We shouldn't return a stale global run if we are in a new pid
|
660
|
-
if not
|
661
|
+
if not service or os.getpid() == wandb.run._init_pid:
|
661
662
|
logger.info("wandb.init() called when a run is still active")
|
662
663
|
with telemetry.context() as tel:
|
663
664
|
tel.feature.init_return_run = True
|
@@ -665,15 +666,20 @@ class _WandbInit:
|
|
665
666
|
|
666
667
|
logger.info("starting backend")
|
667
668
|
|
668
|
-
|
669
|
-
if
|
670
|
-
logger.info("
|
671
|
-
|
672
|
-
settings=self.settings.to_proto(),
|
669
|
+
service = self._wl.service
|
670
|
+
if service:
|
671
|
+
logger.info("sending inform_init request")
|
672
|
+
service.inform_init(
|
673
|
+
settings=self.settings.to_proto(),
|
674
|
+
run_id=self.settings.run_id,
|
673
675
|
)
|
674
676
|
|
675
677
|
mailbox = Mailbox()
|
676
|
-
backend = Backend(
|
678
|
+
backend = Backend(
|
679
|
+
settings=self.settings,
|
680
|
+
service=service,
|
681
|
+
mailbox=mailbox,
|
682
|
+
)
|
677
683
|
backend.ensure_launched()
|
678
684
|
logger.info("backend started and connected")
|
679
685
|
# Make sure we are logged in
|
@@ -739,7 +745,7 @@ class _WandbInit:
|
|
739
745
|
if os.environ.get(wandb.env._DISABLE_SERVICE):
|
740
746
|
tel.feature.service_disabled = True
|
741
747
|
|
742
|
-
if
|
748
|
+
if service:
|
743
749
|
tel.feature.service = True
|
744
750
|
if self.settings._flow_control_disabled:
|
745
751
|
tel.feature.flow_control_disabled = True
|
@@ -830,7 +836,7 @@ class _WandbInit:
|
|
830
836
|
|
831
837
|
if error is not None:
|
832
838
|
logger.error(f"encountered error: {error}")
|
833
|
-
if not
|
839
|
+
if not service:
|
834
840
|
# Shutdown the backend and get rid of the logger
|
835
841
|
# we don't need to do console cleanup at this point
|
836
842
|
backend.cleanup()
|
@@ -857,9 +863,10 @@ class _WandbInit:
|
|
857
863
|
logger.info("starting run threads in backend")
|
858
864
|
# initiate run (stats and metadata probing)
|
859
865
|
|
860
|
-
if
|
861
|
-
|
862
|
-
settings=self.settings.to_proto(),
|
866
|
+
if service:
|
867
|
+
service.inform_start(
|
868
|
+
settings=self.settings.to_proto(),
|
869
|
+
run_id=self.settings.run_id,
|
863
870
|
)
|
864
871
|
|
865
872
|
assert backend.interface
|
@@ -934,33 +941,37 @@ def _attach(
|
|
934
941
|
if logger is None:
|
935
942
|
raise UsageError("logger is not initialized")
|
936
943
|
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
944
|
+
service = _wl.service
|
945
|
+
if not service:
|
946
|
+
raise UsageError(f"Unable to attach to run {attach_id} (no service process)")
|
947
|
+
|
948
|
+
try:
|
949
|
+
attach_settings = service.inform_attach(attach_id=attach_id)
|
950
|
+
except Exception as e:
|
951
|
+
raise UsageError(f"Unable to attach to run {attach_id}") from e
|
941
952
|
|
942
953
|
settings: Settings = copy.copy(_wl._settings)
|
943
954
|
|
944
955
|
settings.update(
|
945
956
|
{
|
946
957
|
"run_id": attach_id,
|
947
|
-
"_start_time":
|
948
|
-
"_start_datetime":
|
949
|
-
"_offline":
|
958
|
+
"_start_time": attach_settings._start_time.value,
|
959
|
+
"_start_datetime": attach_settings._start_datetime.value,
|
960
|
+
"_offline": attach_settings._offline.value,
|
950
961
|
},
|
951
962
|
source=Source.INIT,
|
952
963
|
)
|
953
964
|
|
954
965
|
# TODO: consolidate this codepath with wandb.init()
|
955
966
|
mailbox = Mailbox()
|
956
|
-
backend = Backend(settings=settings,
|
967
|
+
backend = Backend(settings=settings, service=service, mailbox=mailbox)
|
957
968
|
backend.ensure_launched()
|
958
969
|
logger.info("attach backend started and connected")
|
959
970
|
|
960
971
|
if run is None:
|
961
972
|
run = Run(settings=settings)
|
962
973
|
else:
|
963
|
-
run._init()
|
974
|
+
run._init(settings=settings)
|
964
975
|
run._set_library(_wl)
|
965
976
|
run._set_backend(backend)
|
966
977
|
backend._hack_set_run(run)
|