wandb 0.18.1__py3-none-win32.whl → 0.18.2__py3-none-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -3
- wandb/__init__.pyi +67 -12
- wandb/apis/internal.py +3 -0
- wandb/apis/public/api.py +128 -2
- wandb/apis/public/artifacts.py +11 -7
- wandb/apis/public/jobs.py +8 -0
- wandb/apis/public/runs.py +16 -5
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +0 -3
- wandb/errors/__init__.py +11 -40
- wandb/errors/errors.py +37 -0
- wandb/errors/warnings.py +2 -0
- wandb/integration/tensorboard/log.py +1 -1
- wandb/old/core.py +2 -80
- wandb/plot/bar.py +7 -4
- wandb/plot/confusion_matrix.py +5 -4
- wandb/plot/histogram.py +7 -4
- wandb/plot/line.py +7 -4
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/_validators.py +48 -3
- wandb/sdk/artifacts/artifact.py +157 -183
- wandb/sdk/artifacts/artifact_file_cache.py +13 -11
- wandb/sdk/artifacts/artifact_instance_cache.py +4 -2
- wandb/sdk/artifacts/artifact_manifest.py +13 -11
- wandb/sdk/artifacts/artifact_manifest_entry.py +24 -22
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +9 -7
- wandb/sdk/artifacts/artifact_saver.py +27 -25
- wandb/sdk/artifacts/exceptions.py +26 -25
- wandb/sdk/artifacts/storage_handler.py +11 -9
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +16 -14
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +15 -13
- wandb/sdk/artifacts/storage_handlers/http_handler.py +15 -14
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +14 -12
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +19 -19
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +10 -8
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +12 -10
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +9 -7
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +31 -29
- wandb/sdk/artifacts/storage_policy.py +20 -20
- wandb/sdk/backend/backend.py +8 -26
- wandb/sdk/data_types/base_types/wb_value.py +1 -3
- wandb/sdk/data_types/video.py +2 -2
- wandb/sdk/interface/interface.py +0 -24
- wandb/sdk/interface/interface_shared.py +0 -12
- wandb/sdk/internal/handler.py +0 -10
- wandb/sdk/internal/internal_api.py +71 -0
- wandb/sdk/internal/sender.py +0 -43
- wandb/sdk/internal/tb_watcher.py +1 -1
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/hashutil.py +34 -12
- wandb/sdk/lib/service_connection.py +216 -0
- wandb/sdk/lib/service_token.py +94 -0
- wandb/sdk/lib/sock_client.py +7 -3
- wandb/sdk/service/server.py +2 -5
- wandb/sdk/service/service.py +0 -22
- wandb/sdk/wandb_init.py +32 -22
- wandb/sdk/wandb_run.py +12 -7
- wandb/sdk/wandb_settings.py +2 -0
- wandb/sdk/wandb_setup.py +25 -16
- wandb/sdk/wandb_sync.py +9 -3
- wandb/sdk/wandb_watch.py +31 -15
- wandb/util.py +8 -1
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/METADATA +2 -1
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/RECORD +71 -71
- wandb/sdk/internal/update.py +0 -113
- wandb/sdk/service/service_base.py +0 -50
- wandb/sdk/service/service_sock.py +0 -70
- wandb/sdk/wandb_manager.py +0 -232
- /wandb/{sdk/lib → plot}/viz.py +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/WHEEL +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/entry_points.txt +0 -0
- {wandb-0.18.1.dist-info → wandb-0.18.2.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/lib/hashutil.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import base64
|
2
4
|
import hashlib
|
3
5
|
import mmap
|
4
|
-
import os
|
5
6
|
import sys
|
6
|
-
from
|
7
|
-
from typing import NewType, Union
|
7
|
+
from typing import TYPE_CHECKING, NewType
|
8
8
|
|
9
9
|
from wandb.sdk.lib.paths import StrPath
|
10
10
|
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
import _hashlib # type: ignore[import-not-found]
|
13
|
+
|
11
14
|
ETag = NewType("ETag", str)
|
12
15
|
HexMD5 = NewType("HexMD5", str)
|
13
16
|
B64MD5 = NewType("B64MD5", str)
|
14
17
|
|
15
18
|
|
16
|
-
def _md5(data: bytes = b"") ->
|
19
|
+
def _md5(data: bytes = b"") -> _hashlib.HASH:
|
17
20
|
"""Allow FIPS-compliant md5 hash when supported."""
|
18
21
|
if sys.version_info >= (3, 9):
|
19
22
|
return hashlib.md5(data, usedforsecurity=False)
|
@@ -25,7 +28,7 @@ def md5_string(string: str) -> B64MD5:
|
|
25
28
|
return _b64_from_hasher(_md5(string.encode("utf-8")))
|
26
29
|
|
27
30
|
|
28
|
-
def _b64_from_hasher(hasher:
|
31
|
+
def _b64_from_hasher(hasher: _hashlib.HASH) -> B64MD5:
|
29
32
|
return B64MD5(base64.b64encode(hasher.digest()).decode("ascii"))
|
30
33
|
|
31
34
|
|
@@ -33,7 +36,7 @@ def b64_to_hex_id(string: B64MD5) -> HexMD5:
|
|
33
36
|
return HexMD5(base64.standard_b64decode(string).hex())
|
34
37
|
|
35
38
|
|
36
|
-
def hex_to_b64_id(encoded_string:
|
39
|
+
def hex_to_b64_id(encoded_string: str | bytes) -> B64MD5:
|
37
40
|
if isinstance(encoded_string, bytes):
|
38
41
|
encoded_string = encoded_string.decode("utf-8")
|
39
42
|
as_str = bytes.fromhex(encoded_string)
|
@@ -48,15 +51,34 @@ def md5_file_hex(*paths: StrPath) -> HexMD5:
|
|
48
51
|
return HexMD5(_md5_file_hasher(*paths).hexdigest())
|
49
52
|
|
50
53
|
|
51
|
-
|
54
|
+
_KB: int = 1_024
|
55
|
+
_CHUNKSIZE: int = 128 * _KB
|
56
|
+
"""Chunk size (in bytes) for iteratively reading from file, if needed."""
|
57
|
+
|
58
|
+
|
59
|
+
def _md5_file_hasher(*paths: StrPath) -> _hashlib.HASH:
|
52
60
|
md5_hash = _md5()
|
53
61
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
else:
|
62
|
+
# Note: We use str paths (instead of pathlib.Path objs) for minor perf improvements.
|
63
|
+
for path in sorted(map(str, paths)):
|
64
|
+
with open(path, "rb") as f:
|
65
|
+
try:
|
59
66
|
with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mview:
|
60
67
|
md5_hash.update(mview)
|
68
|
+
except OSError:
|
69
|
+
# This occurs if the mmap-ed file is on a different/mounted filesystem,
|
70
|
+
# so we'll fall back on a less performant implementation.
|
71
|
+
|
72
|
+
# Note: At the time of implementation, the walrus operator `:=`
|
73
|
+
# is avoided to maintain support for users on python 3.7.
|
74
|
+
# Consider revisiting once 3.7 support is no longer needed.
|
75
|
+
chunk = f.read(_CHUNKSIZE)
|
76
|
+
while chunk:
|
77
|
+
md5_hash.update(chunk)
|
78
|
+
chunk = f.read(_CHUNKSIZE)
|
79
|
+
except ValueError:
|
80
|
+
# This occurs when mmap-ing an empty file, which can be skipped.
|
81
|
+
# See: https://github.com/python/cpython/blob/986a4e1b6fcae7fe7a1d0a26aea446107dd58dd2/Modules/mmapmodule.c#L1589
|
82
|
+
pass
|
61
83
|
|
62
84
|
return md5_hash
|
@@ -0,0 +1,216 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import atexit
|
4
|
+
import os
|
5
|
+
from typing import Callable
|
6
|
+
|
7
|
+
from wandb.proto import wandb_internal_pb2 as pb
|
8
|
+
from wandb.proto import wandb_server_pb2 as spb
|
9
|
+
from wandb.proto import wandb_settings_pb2
|
10
|
+
from wandb.sdk import wandb_settings
|
11
|
+
from wandb.sdk.interface.interface import InterfaceBase
|
12
|
+
from wandb.sdk.interface.interface_sock import InterfaceSock
|
13
|
+
from wandb.sdk.lib import service_token
|
14
|
+
from wandb.sdk.lib.exit_hooks import ExitHooks
|
15
|
+
from wandb.sdk.lib.mailbox import Mailbox
|
16
|
+
from wandb.sdk.lib.sock_client import SockClient, SockClientTimeoutError
|
17
|
+
from wandb.sdk.service import service
|
18
|
+
|
19
|
+
|
20
|
+
class WandbServiceNotOwnedError(Exception):
    """Signals that the service process was not started by this process.

    Raised by operations, such as tearing the service down, that only the
    process owning the service may perform.
    """
|
22
|
+
|
23
|
+
|
24
|
+
class WandbServiceConnectionError(Exception):
    """Signals that a connection to the service process could not be made."""
|
26
|
+
|
27
|
+
|
28
|
+
class WandbAttachFailedError(Exception):
    """Signals that an attempt to attach to a run did not succeed."""
|
30
|
+
|
31
|
+
|
32
|
+
def connect_to_service(
    settings: wandb_settings.Settings,
) -> ServiceConnection:
    """Return a connection to the service process, launching one if needed.

    An already-running service (advertised through the service token) is
    preferred; a new service process is started only when none is found.

    Args:
        settings: Settings used when a new service process must be started.

    Returns:
        A connection to the existing or newly started service.
    """
    # A ServiceConnection is always truthy, so `or` only falls through to
    # starting a new service when no existing one was found (None).
    return _try_connect_to_existing_service() or _start_and_connect_service(settings)
|
41
|
+
|
42
|
+
|
43
|
+
def _try_connect_to_existing_service() -> ServiceConnection | None:
    """Attempts to connect to an existing service process.

    Returns:
        A connection that does not own the service process (``proc=None``),
        or None if no service token is set in the environment.

    Raises:
        WandbServiceConnectionError: If the token names a host other than
            localhost, or if connecting to the advertised port fails.
        ValueError: If the service token is set but cannot be parsed
            (propagated from ``service_token.get_service_token``).
    """
    token = service_token.get_service_token()
    if not token:
        return None

    # Only localhost sockets are supported below. The token is read from an
    # environment variable, so it is validated explicitly rather than with an
    # `assert`, which would be stripped when Python runs with -O.
    if token.host != "localhost":
        raise WandbServiceConnectionError(
            f"Unsupported service host {token.host!r};"
            " only localhost is supported."
        )

    client = SockClient()

    try:
        # TODO: This may block indefinitely if the service is unhealthy.
        client.connect(token.port)
    except Exception as e:
        raise WandbServiceConnectionError(
            "Failed to connect to internal service."
        ) from e

    return ServiceConnection(client=client, proc=None)
|
63
|
+
|
64
|
+
|
65
|
+
def _start_and_connect_service(
    settings: wandb_settings.Settings,
) -> ServiceConnection:
    """Launch a new service process and return a connection that owns it.

    The service's address is published through the service token, and an
    atexit hook is registered that tears the service process down and waits
    for it to complete. The hook does not run in processes started using the
    multiprocessing module.

    Args:
        settings: Settings passed to the new service process.

    Returns:
        A connection to the newly started service.
    """
    service_proc = service._Service(settings)
    service_proc.start()

    sock_port = service_proc.sock_port
    assert sock_port

    sock = SockClient()
    sock.connect(sock_port)

    service_token.set_service_token(
        parent_pid=os.getpid(),
        transport="tcp",
        host="localhost",
        port=sock_port,
    )

    exit_hooks = ExitHooks()
    exit_hooks.hook()

    def teardown_atexit():
        # `connection` is bound below, before this hook can ever run.
        connection.teardown(exit_hooks.exit_code)

    def unregister_teardown():
        # Run at the start of an explicit teardown so that the atexit hook
        # does not tear the service down a second time.
        atexit.unregister(teardown_atexit)

    connection = ServiceConnection(
        client=sock,
        proc=service_proc,
        cleanup=unregister_teardown,
    )

    atexit.register(teardown_atexit)

    return connection
|
104
|
+
|
105
|
+
|
106
|
+
class ServiceConnection:
    """A connection to the W&B internal service process."""

    def __init__(
        self,
        client: SockClient,
        proc: service._Service | None,
        cleanup: Callable[[], None] | None = None,
    ):
        """Initializes a new ServiceConnection.

        Args:
            client: A socket that's connected to the service.
            proc: The service process if we own it, or None otherwise.
            cleanup: A callback to run at the start of teardown, before the
                service is asked to shut down.
        """
        self._client = client
        self._proc = proc
        self._torn_down = False
        self._cleanup = cleanup

    def make_interface(self, mailbox: Mailbox) -> InterfaceBase:
        """Returns an interface for communicating with the service."""
        return InterfaceSock(self._client, mailbox)

    def send_record(self, record: pb.Record) -> None:
        """Sends data to the service."""
        self._client.send_record_publish(record)

    def inform_init(
        self,
        settings: wandb_settings_pb2.Settings,
        run_id: str,
    ) -> None:
        """Sends an init request to the service."""
        request = spb.ServerInformInitRequest()
        request.settings.CopyFrom(settings)
        request._info.stream_id = run_id
        self._client.send(inform_init=request)

    def inform_finish(self, run_id: str) -> None:
        """Sends a finish request to the service."""
        request = spb.ServerInformFinishRequest()
        request._info.stream_id = run_id
        self._client.send(inform_finish=request)

    def inform_attach(
        self,
        attach_id: str,
    ) -> wandb_settings_pb2.Settings:
        """Sends an attach request to the service.

        Returns:
            The settings carried in the service's attach response.

        Raises:
            WandbAttachFailedError: If the service does not respond before
                the socket client's timeout.
        """
        request = spb.ServerInformAttachRequest()
        request._info.stream_id = attach_id

        try:
            response = self._client.send_and_recv(inform_attach=request)
        except SockClientTimeoutError as e:
            # Chain explicitly so the timeout is reported as the direct cause.
            raise WandbAttachFailedError(
                "Could not attach because the run does not belong to"
                " the current service process, or because the service"
                " process is busy (unlikely)."
            ) from e

        return response.inform_attach_response.settings

    def inform_start(
        self,
        settings: wandb_settings_pb2.Settings,
        run_id: str,
    ) -> None:
        """Sends a start request to the service."""
        request = spb.ServerInformStartRequest()
        request.settings.CopyFrom(settings)
        request._info.stream_id = run_id
        self._client.send(inform_start=request)

    def teardown(self, exit_code: int) -> int:
        """Shuts down the service process and returns its exit code.

        This may only be called once.

        Args:
            exit_code: The exit code forwarded to the service in the
                teardown request.

        Returns:
            The exit code of the service process.

        Raises:
            WandbServiceNotOwnedError: If the current process did not start
                the service process.
        """
        if not self._proc:
            raise WandbServiceNotOwnedError(
                "Cannot tear down service started by different process",
            )

        assert not self._torn_down
        self._torn_down = True

        if self._cleanup:
            self._cleanup()

        # Clear the service token to prevent new connections from being made.
        service_token.clear_service_token()

        self._client.send(
            inform_teardown=spb.ServerInformTeardownRequest(
                exit_code=exit_code,
            )
        )

        return self._proc.join()
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import dataclasses
|
4
|
+
import os
|
5
|
+
|
6
|
+
from wandb import env
|
7
|
+
|
8
|
+
# Version field written into, and required of, every service token.
_CURRENT_VERSION = "2"

# Transports a service token may name. This is a set rather than a bare
# string: `transport in "tcp"` would be a substring test and would also
# accept "", "t", "tc" and "cp" as valid transports.
_SUPPORTED_TRANSPORTS = frozenset({"tcp"})
|
10
|
+
|
11
|
+
|
12
|
+
def get_service_token() -> ServiceToken | None:
    """Parse the service token stored in the environment, if any.

    The token is a "-"-separated string of exactly five fields, in order:
    version, PID, transport, host and port.

    Returns:
        The parsed token, or None if the environment variable is unset or
        empty.

    Raises:
        ValueError: If the variable is set but does not have five fields,
            has an unexpected version or transport, or has a non-integer
            PID or port.
    """
    token = os.environ.get(env.SERVICE)
    if not token:
        return None

    fields = token.split("-")
    if len(fields) != 5:
        raise ValueError(f"Invalid token: {token}")

    version, pid_text, transport, host, port_text = fields

    if version != _CURRENT_VERSION:
        raise ValueError(
            f"Expected version {_CURRENT_VERSION},"
            f" but got {version} (token={token})"
        )
    if transport not in _SUPPORTED_TRANSPORTS:
        raise ValueError(
            f"Unsupported transport: {transport} (token={token})",
        )

    # Only the integer conversions can fail here, so only they are guarded.
    try:
        pid = int(pid_text)
        port = int(port_text)
    except ValueError as e:
        raise ValueError(f"Invalid token: {token}") from e

    return ServiceToken(
        version=version,
        pid=pid,
        transport=transport,
        host=host,
        port=port,
    )
|
52
|
+
|
53
|
+
|
54
|
+
def set_service_token(parent_pid: int, transport: str, host: str, port: int) -> None:
    """Publish a service token through an environment variable.

    The token is the current token version followed by the given fields,
    joined with "-".

    Args:
        parent_pid: The process ID of the process that started the service.
        transport: The transport used to communicate with the service.
        host: The host part of the internet address on which the service
            is listening (e.g. localhost).
        port: The port the service is listening on.

    Raises:
        ValueError: If given an unsupported transport.
    """
    if transport not in _SUPPORTED_TRANSPORTS:
        raise ValueError(f"Unsupported transport: {transport}")

    fields = [_CURRENT_VERSION, str(parent_pid), transport, host, str(port)]
    os.environ[env.SERVICE] = "-".join(fields)
|
79
|
+
|
80
|
+
|
81
|
+
def clear_service_token() -> None:
    """Remove the service token from the environment; a no-op if it is unset."""
    try:
        del os.environ[env.SERVICE]
    except KeyError:
        # The variable was not set, so there is nothing to clear.
        pass
|
84
|
+
|
85
|
+
|
86
|
+
@dataclasses.dataclass(frozen=True)
class ServiceToken:
    """Immutable description of a running service process and how to reach it."""

    # Token format version.
    version: str
    # Process ID recorded in the token.
    pid: int
    # Transport used to talk to the service (e.g. "tcp").
    transport: str
    # Host the service is listening on (e.g. "localhost").
    host: str
    # Port the service is listening on.
    port: int
|
wandb/sdk/lib/sock_client.py
CHANGED
@@ -14,9 +14,11 @@ if TYPE_CHECKING:
|
|
14
14
|
|
15
15
|
|
16
16
|
class SockClientClosedError(Exception):
|
17
|
-
"""
|
17
|
+
"""Raised on operations on a closed socket."""
|
18
18
|
|
19
|
-
|
19
|
+
|
20
|
+
class SockClientTimeoutError(Exception):
|
21
|
+
"""Raised if the server didn't respond before the timeout."""
|
20
22
|
|
21
23
|
|
22
24
|
class SockBuffer:
|
@@ -182,8 +184,10 @@ class SockClient:
|
|
182
184
|
# it should be relatively stable.
|
183
185
|
# This pass would be solved as part of the fix in https://wandb.atlassian.net/browse/WB-8709
|
184
186
|
response = self.read_server_response(timeout=1)
|
187
|
+
|
185
188
|
if response is None:
|
186
|
-
raise
|
189
|
+
raise SockClientTimeoutError("No response after 1 second.")
|
190
|
+
|
187
191
|
return response
|
188
192
|
|
189
193
|
def send(
|
wandb/sdk/service/server.py
CHANGED
@@ -20,7 +20,6 @@ class WandbServer:
|
|
20
20
|
_pid: Optional[int]
|
21
21
|
_sock_port: Optional[int]
|
22
22
|
_debug: bool
|
23
|
-
_serve_sock: bool
|
24
23
|
_sock_server: Optional[SocketServer]
|
25
24
|
_startup_debug_enabled: bool
|
26
25
|
|
@@ -31,14 +30,12 @@ class WandbServer:
|
|
31
30
|
address: Optional[str] = None,
|
32
31
|
pid: Optional[int] = None,
|
33
32
|
debug: bool = True,
|
34
|
-
serve_sock: bool = False,
|
35
33
|
) -> None:
|
36
34
|
self._sock_port = sock_port
|
37
35
|
self._port_fname = port_fname
|
38
36
|
self._address = address
|
39
37
|
self._pid = pid
|
40
38
|
self._debug = debug
|
41
|
-
self._serve_sock = serve_sock
|
42
39
|
self._sock_server = None
|
43
40
|
self._startup_debug_enabled = _startup_debug.is_enabled()
|
44
41
|
|
@@ -97,7 +94,7 @@ class WandbServer:
|
|
97
94
|
pid = str(self._pid or 0)
|
98
95
|
transport = "s" if sock_port else "g"
|
99
96
|
port = sock_port or 0
|
100
|
-
# this format is similar to
|
97
|
+
# this format is similar to the service token, but it's purely informative now
|
101
98
|
# (consider unifying this in the future)
|
102
99
|
service_id = f"{service_ver}-{pid}-{transport}-{port}"
|
103
100
|
proc_title = f"wandb-service({service_id})"
|
@@ -109,7 +106,7 @@ class WandbServer:
|
|
109
106
|
self._setup_tracelog()
|
110
107
|
mux = StreamMux()
|
111
108
|
self._startup_debug_print("before_network")
|
112
|
-
sock_port = self._start_sock(mux=mux)
|
109
|
+
sock_port = self._start_sock(mux=mux)
|
113
110
|
self._startup_debug_print("after_network")
|
114
111
|
self._inform_used_ports(sock_port=sock_port)
|
115
112
|
self._startup_debug_print("after_inform")
|
wandb/sdk/service/service.py
CHANGED
@@ -21,8 +21,6 @@ from wandb.sdk.lib.wburls import wburls
|
|
21
21
|
from wandb.util import get_core_path, get_module
|
22
22
|
|
23
23
|
from . import _startup_debug, port_file
|
24
|
-
from .service_base import ServiceInterface
|
25
|
-
from .service_sock import ServiceSockInterface
|
26
24
|
|
27
25
|
if TYPE_CHECKING:
|
28
26
|
from wandb.sdk.wandb_settings import Settings
|
@@ -31,25 +29,18 @@ if TYPE_CHECKING:
|
|
31
29
|
class ServiceStartProcessError(Error):
|
32
30
|
"""Raised when a known error occurs when launching wandb service."""
|
33
31
|
|
34
|
-
pass
|
35
|
-
|
36
32
|
|
37
33
|
class ServiceStartTimeoutError(Error):
|
38
34
|
"""Raised when service start times out."""
|
39
35
|
|
40
|
-
pass
|
41
|
-
|
42
36
|
|
43
37
|
class ServiceStartPortError(Error):
|
44
38
|
"""Raised when service start fails to find a port."""
|
45
39
|
|
46
|
-
pass
|
47
|
-
|
48
40
|
|
49
41
|
class _Service:
|
50
42
|
_settings: "Settings"
|
51
43
|
_sock_port: Optional[int]
|
52
|
-
_service_interface: ServiceInterface
|
53
44
|
_internal_proc: Optional[subprocess.Popen]
|
54
45
|
_startup_debug_enabled: bool
|
55
46
|
|
@@ -65,10 +56,6 @@ class _Service:
|
|
65
56
|
|
66
57
|
_sentry.configure_scope(tags=dict(settings), process_context="service")
|
67
58
|
|
68
|
-
# current code only supports socket server implementation, in the
|
69
|
-
# future we might be able to support both
|
70
|
-
self._service_interface = ServiceSockInterface()
|
71
|
-
|
72
59
|
def _startup_debug_print(self, message: str) -> None:
|
73
60
|
if not self._startup_debug_enabled:
|
74
61
|
return
|
@@ -175,10 +162,6 @@ class _Service:
|
|
175
162
|
if core_debug(default="False"):
|
176
163
|
service_args.append("--debug")
|
177
164
|
|
178
|
-
trace_filename = os.environ.get("_WANDB_TRACE")
|
179
|
-
if trace_filename is not None:
|
180
|
-
service_args.extend(["--trace", trace_filename])
|
181
|
-
|
182
165
|
exec_cmd_list = []
|
183
166
|
termlog(
|
184
167
|
"Using wandb-core as the SDK backend."
|
@@ -194,7 +177,6 @@ class _Service:
|
|
194
177
|
"--pid",
|
195
178
|
pid,
|
196
179
|
]
|
197
|
-
service_args.append("--serve-sock")
|
198
180
|
|
199
181
|
if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
|
200
182
|
_ = get_module(
|
@@ -253,10 +235,6 @@ class _Service:
|
|
253
235
|
def sock_port(self) -> Optional[int]:
|
254
236
|
return self._sock_port
|
255
237
|
|
256
|
-
@property
|
257
|
-
def service_interface(self) -> ServiceInterface:
|
258
|
-
return self._service_interface
|
259
|
-
|
260
238
|
def join(self) -> int:
|
261
239
|
ret = 0
|
262
240
|
if self._internal_proc:
|
wandb/sdk/wandb_init.py
CHANGED
@@ -655,9 +655,9 @@ class _WandbInit:
|
|
655
655
|
f"Successfully finished last run (ID:{latest_run._run_id}). Initializing new run:<br/>"
|
656
656
|
)
|
657
657
|
elif isinstance(wandb.run, Run):
|
658
|
-
|
658
|
+
service = self._wl.service
|
659
659
|
# We shouldn't return a stale global run if we are in a new pid
|
660
|
-
if not
|
660
|
+
if not service or os.getpid() == wandb.run._init_pid:
|
661
661
|
logger.info("wandb.init() called when a run is still active")
|
662
662
|
with telemetry.context() as tel:
|
663
663
|
tel.feature.init_return_run = True
|
@@ -665,15 +665,20 @@ class _WandbInit:
|
|
665
665
|
|
666
666
|
logger.info("starting backend")
|
667
667
|
|
668
|
-
|
669
|
-
if
|
670
|
-
logger.info("
|
671
|
-
|
672
|
-
settings=self.settings.to_proto(),
|
668
|
+
service = self._wl.service
|
669
|
+
if service:
|
670
|
+
logger.info("sending inform_init request")
|
671
|
+
service.inform_init(
|
672
|
+
settings=self.settings.to_proto(),
|
673
|
+
run_id=self.settings.run_id,
|
673
674
|
)
|
674
675
|
|
675
676
|
mailbox = Mailbox()
|
676
|
-
backend = Backend(
|
677
|
+
backend = Backend(
|
678
|
+
settings=self.settings,
|
679
|
+
service=service,
|
680
|
+
mailbox=mailbox,
|
681
|
+
)
|
677
682
|
backend.ensure_launched()
|
678
683
|
logger.info("backend started and connected")
|
679
684
|
# Make sure we are logged in
|
@@ -739,7 +744,7 @@ class _WandbInit:
|
|
739
744
|
if os.environ.get(wandb.env._DISABLE_SERVICE):
|
740
745
|
tel.feature.service_disabled = True
|
741
746
|
|
742
|
-
if
|
747
|
+
if service:
|
743
748
|
tel.feature.service = True
|
744
749
|
if self.settings._flow_control_disabled:
|
745
750
|
tel.feature.flow_control_disabled = True
|
@@ -830,7 +835,7 @@ class _WandbInit:
|
|
830
835
|
|
831
836
|
if error is not None:
|
832
837
|
logger.error(f"encountered error: {error}")
|
833
|
-
if not
|
838
|
+
if not service:
|
834
839
|
# Shutdown the backend and get rid of the logger
|
835
840
|
# we don't need to do console cleanup at this point
|
836
841
|
backend.cleanup()
|
@@ -857,9 +862,10 @@ class _WandbInit:
|
|
857
862
|
logger.info("starting run threads in backend")
|
858
863
|
# initiate run (stats and metadata probing)
|
859
864
|
|
860
|
-
if
|
861
|
-
|
862
|
-
settings=self.settings.to_proto(),
|
865
|
+
if service:
|
866
|
+
service.inform_start(
|
867
|
+
settings=self.settings.to_proto(),
|
868
|
+
run_id=self.settings.run_id,
|
863
869
|
)
|
864
870
|
|
865
871
|
assert backend.interface
|
@@ -934,33 +940,37 @@ def _attach(
|
|
934
940
|
if logger is None:
|
935
941
|
raise UsageError("logger is not initialized")
|
936
942
|
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
943
|
+
service = _wl.service
|
944
|
+
if not service:
|
945
|
+
raise UsageError(f"Unable to attach to run {attach_id} (no service process)")
|
946
|
+
|
947
|
+
try:
|
948
|
+
attach_settings = service.inform_attach(attach_id=attach_id)
|
949
|
+
except Exception as e:
|
950
|
+
raise UsageError(f"Unable to attach to run {attach_id}") from e
|
941
951
|
|
942
952
|
settings: Settings = copy.copy(_wl._settings)
|
943
953
|
|
944
954
|
settings.update(
|
945
955
|
{
|
946
956
|
"run_id": attach_id,
|
947
|
-
"_start_time":
|
948
|
-
"_start_datetime":
|
949
|
-
"_offline":
|
957
|
+
"_start_time": attach_settings._start_time.value,
|
958
|
+
"_start_datetime": attach_settings._start_datetime.value,
|
959
|
+
"_offline": attach_settings._offline.value,
|
950
960
|
},
|
951
961
|
source=Source.INIT,
|
952
962
|
)
|
953
963
|
|
954
964
|
# TODO: consolidate this codepath with wandb.init()
|
955
965
|
mailbox = Mailbox()
|
956
|
-
backend = Backend(settings=settings,
|
966
|
+
backend = Backend(settings=settings, service=service, mailbox=mailbox)
|
957
967
|
backend.ensure_launched()
|
958
968
|
logger.info("attach backend started and connected")
|
959
969
|
|
960
970
|
if run is None:
|
961
971
|
run = Run(settings=settings)
|
962
972
|
else:
|
963
|
-
run._init()
|
973
|
+
run._init(settings=settings)
|
964
974
|
run._set_library(_wl)
|
965
975
|
run._set_backend(backend)
|
966
976
|
backend._hack_set_run(run)
|