wandb 0.20.1__py3-none-win32.whl → 0.20.2rc20250616__py3-none-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +42 -68
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/service/server.py
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
"""wandb server.
|
2
|
-
|
3
|
-
Start up socket transport servers.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import logging
|
7
|
-
import os
|
8
|
-
import sys
|
9
|
-
from typing import Optional
|
10
|
-
|
11
|
-
import wandb
|
12
|
-
|
13
|
-
from . import _startup_debug, port_file
|
14
|
-
from .server_sock import SocketServer
|
15
|
-
from .streams import StreamMux
|
16
|
-
|
17
|
-
|
18
|
-
class WandbServer:
    """Run the socket transport server of the wandb service process.

    ``serve`` creates a ``StreamMux``, starts a ``SocketServer`` for it,
    reports the bound port through the port file (when one was requested),
    optionally sets the process title, and then blocks in ``mux.loop()``.
    Once the loop returns the socket server is stopped.
    """

    _pid: Optional[int]
    _sock_port: Optional[int]
    _debug: bool
    _sock_server: Optional["SocketServer"]
    _startup_debug_enabled: bool

    def __init__(
        self,
        sock_port: Optional[int] = None,
        port_fname: Optional[str] = None,
        address: Optional[str] = None,
        pid: Optional[int] = None,
        debug: bool = True,
    ) -> None:
        """Record the configuration; the socket server is created in ``serve``.

        Args:
            sock_port: Port handed to the socket server; a falsy value is
                replaced by ``0`` when the server is started.
            port_fname: Path of the port file to write. When falsy, no port
                file is written.
            address: Address to bind; a falsy value is replaced by
                ``"127.0.0.1"`` when the server is started.
            pid: Process id forwarded to ``mux.set_pid`` and embedded in the
                process title.
            debug: When true, configures root logging at DEBUG level on
                stdout.
        """
        self._sock_server = None
        self._sock_port = sock_port
        self._port_fname = port_fname
        self._address = address
        self._pid = pid
        self._debug = debug
        self._startup_debug_enabled = _startup_debug.is_enabled()

        if debug:
            logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    def _inform_used_ports(self, sock_port: Optional[int]) -> None:
        """Write ``sock_port`` to the port file, if a port file was requested."""
        if self._port_fname:
            port_file.PortFile(sock_port=sock_port).write(self._port_fname)

    def _start_sock(self, mux: "StreamMux") -> int:
        """Start the socket server for ``mux`` and return the port it reports.

        On any failure (including ``KeyboardInterrupt``) ``mux.cleanup()`` is
        called before the exception is re-raised.
        """
        server = SocketServer(
            mux=mux,
            address=self._address or "127.0.0.1",
            port=self._sock_port or 0,
        )
        self._sock_server = server
        try:
            server.start()
            bound_port: int = server.port
            if self._pid:
                mux.set_pid(self._pid)
        except (KeyboardInterrupt, Exception):
            mux.cleanup()
            raise
        return bound_port

    def _stop_servers(self) -> None:
        """Stop the socket server if one was started."""
        if not self._sock_server:
            return
        self._sock_server.stop()

    def _startup_debug_print(self, message: str) -> None:
        """Emit ``message`` through ``_startup_debug`` when that is enabled."""
        if self._startup_debug_enabled:
            _startup_debug.print_message(message)

    def _setup_proctitle(self, sock_port: Optional[int]) -> None:
        """Set the process title to ``wandb-service(<id>)`` when possible.

        Nothing happens when ``WANDB_X_DISABLE_SETPROCTITLE`` is set to a
        non-empty value or when the optional ``setproctitle`` module is not
        available.
        """
        # TODO: the internal_process should have a better way to have access to
        # settings.
        if os.environ.get("WANDB_X_DISABLE_SETPROCTITLE"):
            return

        setproctitle = wandb.util.get_optional_module("setproctitle")
        if not setproctitle:
            return

        service_ver = 2
        pid = str(self._pid or 0)
        transport = "s" if sock_port else "g"
        port = sock_port or 0
        # this format is similar to the service token, but it's purely informative now
        # (consider unifying this in the future)
        proc_title = f"wandb-service({service_ver}-{pid}-{transport}-{port})"
        self._startup_debug_print("before_setproctitle")
        setproctitle.setproctitle(proc_title)
        self._startup_debug_print("after_setproctitle")

    def serve(self) -> None:
        """Start the transport, announce the port, and run the mux loop."""
        mux = StreamMux()

        self._startup_debug_print("before_network")
        bound_port = self._start_sock(mux=mux)
        self._startup_debug_print("after_network")

        self._inform_used_ports(sock_port=bound_port)
        self._startup_debug_print("after_inform")

        self._setup_proctitle(sock_port=bound_port)
        self._startup_debug_print("before_loop")

        mux.loop()
        self._stop_servers()
|
wandb/sdk/service/server_sock.py
DELETED
@@ -1,286 +0,0 @@
|
|
1
|
-
import queue
|
2
|
-
import socket
|
3
|
-
import threading
|
4
|
-
import time
|
5
|
-
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
|
6
|
-
|
7
|
-
import wandb
|
8
|
-
from wandb.proto import wandb_internal_pb2 as pb
|
9
|
-
from wandb.proto import wandb_server_pb2 as spb
|
10
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
11
|
-
|
12
|
-
from ..lib.sock_client import SockClient, SockClientClosedError
|
13
|
-
from .streams import StreamMux
|
14
|
-
|
15
|
-
if TYPE_CHECKING:
|
16
|
-
from threading import Event
|
17
|
-
|
18
|
-
from ..interface.interface_relay import InterfaceRelay
|
19
|
-
|
20
|
-
|
21
|
-
class ClientDict:
    """Registry of socket clients keyed by their ``_sockid``.

    Every access to the underlying dict happens while holding ``_lock``.
    """

    _client_dict: Dict[str, "SockClient"]
    _lock: threading.Lock

    def __init__(self) -> None:
        """Create an empty registry with its own lock."""
        self._lock = threading.Lock()
        self._client_dict = {}

    def get_client(self, client_id: str) -> Optional["SockClient"]:
        """Return the client registered under ``client_id``, or ``None``."""
        with self._lock:
            return self._client_dict.get(client_id)

    def add_client(self, client: "SockClient") -> None:
        """Register ``client`` under its ``_sockid``, replacing any previous entry."""
        with self._lock:
            self._client_dict.update({client._sockid: client})

    def del_client(self, client: "SockClient") -> None:
        """Remove ``client``; raises ``KeyError`` if it is not registered."""
        with self._lock:
            self._client_dict.pop(client._sockid)
|
41
|
-
|
42
|
-
|
43
|
-
class SockServerInterfaceReaderThread(threading.Thread):
    """Forward results from an interface's relay queue to socket clients.

    Each result taken from ``iface.relay_q`` names its destination socket in
    ``result.control.relay_id``; the result is wrapped in a
    ``ServerResponse`` and sent to the matching client from ``clients``.
    """

    _socket_client: SockClient
    _stopped: "Event"

    def __init__(
        self,
        clients: ClientDict,
        iface: "InterfaceRelay",
        stopped: "Event",
    ) -> None:
        """Keep references to the client registry, interface and stop event."""
        self._iface = iface
        self._clients = clients
        super().__init__()
        self.name = "SockSrvIntRdThr"
        self._stopped = stopped

    def run(self) -> None:
        """Relay results until the stop event is set or the queue is closed."""
        relay_q = self._iface.relay_q
        while not self._stopped.is_set():
            try:
                # Wake up once a second so the stop event is re-checked.
                result = relay_q.get(timeout=1)
            except queue.Empty:
                continue
            except (OSError, ValueError):
                # OSError: handle is closed; ValueError: queue is closed
                break

            relay_id = result.control.relay_id
            assert relay_id
            target = self._clients.get_client(relay_id)
            assert target

            response = spb.ServerResponse()
            response.request_id = result.control.mailbox_slot
            response.result_communicate.CopyFrom(result)
            target.send_server_response(response)
|
79
|
-
|
80
|
-
|
81
|
-
class SockServerReadThread(threading.Thread):
    """Read server requests from one accepted connection and dispatch them.

    Each request's ``server_request_type`` oneof name selects the method
    ``server_<name>`` on this object, which is then called with the request.
    """

    _sock_client: SockClient
    _mux: StreamMux
    _stopped: "Event"
    _clients: ClientDict

    def __init__(
        self, conn: socket.socket, mux: StreamMux, clients: ClientDict
    ) -> None:
        """Wrap ``conn`` in a ``SockClient`` and remember ``mux`` and ``clients``."""
        self._mux = mux
        super().__init__()
        self.name = "SockSrvRdThr"
        client = SockClient()
        client.set_socket(conn)
        self._sock_client = client
        self._stopped = mux._get_stopped_event()
        self._clients = clients

    def run(self) -> None:
        """Dispatch incoming requests until stopped or the socket is closed."""
        while not self._stopped.is_set():
            try:
                sreq = self._sock_client.read_server_request()
            except SockClientClosedError:
                # socket has been closed
                # TODO: shut down other threads serving this socket?
                break
            assert sreq, "read_server_request should never timeout"
            handler_name = "server_" + sreq.WhichOneof("server_request_type")  # type: ignore
            handler: Callable[[spb.ServerRequest], None] = getattr(  # type: ignore
                self, handler_name, None
            )
            assert handler, f"unknown handle: {handler_name}"  # type: ignore
            handler(sreq)

    def stop(self) -> None:
        """Shut down and close the client socket, ignoring shutdown errors."""
        try:
            # See shutdown notes in class SocketServer for a discussion about this mechanism
            self._sock_client.shutdown(socket.SHUT_RDWR)
        except OSError:
            pass
        self._sock_client.close()

    def server_inform_init(self, sreq: "spb.ServerRequest") -> None:
        """Add a stream for the request and start relaying its results."""
        init = sreq.inform_init
        stream_id = init._info.stream_id
        self._mux.add_stream(stream_id, settings=SettingsStatic(init.settings))

        iface = self._mux.get_stream(stream_id).interface
        self._clients.add_client(self._sock_client)
        SockServerInterfaceReaderThread(
            clients=self._clients,
            iface=iface,
            stopped=self._stopped,
        ).start()

    def server_inform_start(self, sreq: "spb.ServerRequest") -> None:
        """Update the stream's settings, then start the stream."""
        start = sreq.inform_start
        stream_id = start._info.stream_id
        self._mux.update_stream(stream_id, settings=SettingsStatic(start.settings))
        self._mux.start_stream(stream_id)

    def server_inform_attach(self, sreq: "spb.ServerRequest") -> None:
        """Register this client and reply with the stream's settings."""
        stream_id = sreq.inform_attach._info.stream_id

        self._clients.add_client(self._sock_client)
        attach_response = spb.ServerInformAttachResponse()
        attach_response.settings.CopyFrom(
            self._mux._streams[stream_id]._settings._proto,
        )
        self._sock_client.send_server_response(
            spb.ServerResponse(
                request_id=sreq.request_id,
                inform_attach_response=attach_response,
            )
        )

    def server_record_communicate(self, sreq: "spb.ServerRequest") -> None:
        """Queue the request's ``record_communicate`` record."""
        self._put_record(sreq.record_communicate)

    def server_record_publish(self, sreq: "spb.ServerRequest") -> None:
        """Queue the request's ``record_publish`` record."""
        self._put_record(sreq.record_publish)

    def _put_record(self, record: "pb.Record") -> None:
        """Tag ``record`` with this socket's id and queue it on its stream.

        Records whose stream is unknown to the mux are dropped silently.
        """
        # encode relay information so the right socket picks up the data
        record.control.relay_id = self._sock_client._sockid
        stream_id = record._info.stream_id

        try:
            iface = self._mux.get_stream(stream_id).interface
        except KeyError:
            # We should log the error but cannot because it may print to console
            # due to how logging is set up. This error usually happens if
            # a record is sent when no run is active, but during this time the
            # logger prints to the console.
            return

        assert iface.record_q
        iface.record_q.put(record)

    def server_inform_finish(self, sreq: "spb.ServerRequest") -> None:
        """Drop the stream named by the request."""
        self._mux.drop_stream(sreq.inform_finish._info.stream_id)

    def server_inform_teardown(self, sreq: "spb.ServerRequest") -> None:
        """Tear down the mux with the exit code carried by the request."""
        self._mux.teardown(sreq.inform_teardown.exit_code)
|
195
|
-
|
196
|
-
|
197
|
-
class SockAcceptThread(threading.Thread):
    """Accept connections on the listening socket, one reader thread each.

    When the accept loop ends, ``stop()`` is called on every reader thread
    that was started.
    """

    _sock: socket.socket
    _mux: StreamMux
    _stopped: "Event"
    _clients: ClientDict

    def __init__(self, sock: socket.socket, mux: StreamMux) -> None:
        """Remember the listening socket and mux; create the client registry."""
        self._sock = sock
        self._mux = mux
        self._stopped = mux._get_stopped_event()
        super().__init__()
        self.name = "SockAcceptThr"
        self._clients = ClientDict()

    def run(self) -> None:
        """Accept until stopped or the socket errors, then stop all readers."""
        readers = []

        while not self._stopped.is_set():
            try:
                conn, _addr = self._sock.accept()
            except OSError:
                # on shutdown (this also covers ConnectionAbortedError,
                # which is a subclass of OSError)
                break
            reader = SockServerReadThread(
                conn=conn, mux=self._mux, clients=self._clients
            )
            reader.start()
            readers.append(reader)

        for reader in readers:
            reader.stop()
|
228
|
-
|
229
|
-
|
230
|
-
class DebugThread(threading.Thread):
    """Daemon thread that periodically reports the names of all live threads.

    Every 30 seconds it emits one ``wandb.termwarn`` line per thread returned
    by ``threading.enumerate()``.
    """

    def __init__(self, mux: "StreamMux") -> None:
        """Configure the thread; ``mux`` is accepted but not used."""
        super().__init__()
        self.name = "DebugThr"
        self.daemon = True

    def run(self) -> None:
        """Loop forever: sleep 30 seconds, then warn once per live thread."""
        while True:
            time.sleep(30)
            for live in threading.enumerate():
                wandb.termwarn(f"DEBUG: {live.name}")
|
241
|
-
|
242
|
-
|
243
|
-
class SocketServer:
    """TCP listener that hands accepted connections to a ``SockAcceptThread``."""

    _mux: "StreamMux"
    _address: str
    _port: int
    _sock: socket.socket

    def __init__(self, mux: Any, address: str, port: int) -> None:
        """Create the (still unbound) listening socket.

        Args:
            mux: Object passed on to the accept thread.
            address: Address to bind to.
            port: Port to bind to; replaced by the actual bound port in
                ``_bind``.
        """
        # This is the server socket that we accept new connections from
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._mux = mux
        self._address = address
        self._port = port

    def _bind(self) -> None:
        """Bind the socket and record the port reported by ``getsockname``."""
        endpoint = (self._address, self._port)
        self._sock.bind(endpoint)
        bound = self._sock.getsockname()
        self._port = bound[1]

    @property
    def port(self) -> int:
        """The configured port, or the bound port once ``_bind`` has run."""
        return self._port

    def start(self) -> None:
        """Bind, listen with a backlog of 5, and start the accept thread."""
        self._bind()
        self._sock.listen(5)
        accept_thread = SockAcceptThread(sock=self._sock, mux=self._mux)
        self._thread = accept_thread
        accept_thread.start()
        # Note: Uncomment to figure out what thread is not exiting properly
        # self._dbg_thread = DebugThread(mux=self._mux)
        # self._dbg_thread.start()

    def stop(self) -> None:
        """Shut down and close the listening socket to end the accept thread."""
        if not self._sock:
            return
        # we need to stop the SockAcceptThread
        try:
            # TODO(jhr): consider a more graceful shutdown in the future
            # socket.shutdown() is a more heavy handed approach to interrupting socket.accept()
            # in the future we might want to consider a more graceful shutdown which would involve setting
            # a threading Event and then initiating one last connection just to close down the thread
            # The advantage of the heavy handed approach is that it does not depend on the threads functioning
            # properly, that is, if something has gone wrong, we probably want to use this hammer to shut things down
            self._sock.shutdown(socket.SHUT_RDWR)
        except OSError:
            pass
        self._sock.close()
|
wandb/sdk/service/service.py
DELETED
@@ -1,252 +0,0 @@
|
|
1
|
-
"""Reliably launch and connect to backend server process (wandb service).
|
2
|
-
|
3
|
-
Backend server process can be connected to using tcp sockets transport.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import datetime
|
7
|
-
import os
|
8
|
-
import pathlib
|
9
|
-
import platform
|
10
|
-
import shutil
|
11
|
-
import subprocess
|
12
|
-
import sys
|
13
|
-
import tempfile
|
14
|
-
import time
|
15
|
-
from typing import TYPE_CHECKING, Any, Dict, Optional
|
16
|
-
|
17
|
-
from wandb import _sentry
|
18
|
-
from wandb.env import (
|
19
|
-
core_debug,
|
20
|
-
dcgm_profiling_enabled,
|
21
|
-
error_reporting_enabled,
|
22
|
-
is_require_legacy_service,
|
23
|
-
)
|
24
|
-
from wandb.errors import Error, WandbCoreNotAvailableError
|
25
|
-
from wandb.errors.term import termlog, termwarn
|
26
|
-
from wandb.util import get_core_path, get_module
|
27
|
-
|
28
|
-
from . import _startup_debug, port_file
|
29
|
-
|
30
|
-
if TYPE_CHECKING:
|
31
|
-
from wandb.sdk.wandb_settings import Settings
|
32
|
-
|
33
|
-
|
34
|
-
class ServiceStartProcessError(Error):
    """Raised when a known error occurs when launching wandb service.

    In this module it is raised by ``_Service._wait_for_ports`` when the
    launched service process is found to have exited while its port file is
    still being waited for.
    """
|
36
|
-
|
37
|
-
|
38
|
-
class ServiceStartTimeoutError(Error):
    """Raised when service start times out.

    In this module it is raised by ``_Service._wait_for_ports`` when no valid
    port file has been read within ``settings.x_service_wait`` seconds.
    """
|
40
|
-
|
41
|
-
|
42
|
-
class ServiceStartPortError(Error):
    """Raised when service start fails to find a port.

    In this module it is raised by ``_Service._wait_for_ports`` when reading
    the port file raises an exception.
    """
|
44
|
-
|
45
|
-
|
46
|
-
class _Service:
    """Launch the wandb service process and discover its socket port.

    ``start`` spawns the service as a subprocess, telling it to write its
    port to a file inside a temporary directory, and then polls that file
    until it holds a valid port. The port is exposed through ``sock_port``
    and ``join`` waits for the subprocess to exit.
    """

    _settings: "Settings"
    _sock_port: Optional[int]
    _internal_proc: Optional[subprocess.Popen]
    _startup_debug_enabled: bool

    def __init__(
        self,
        settings: "Settings",
    ) -> None:
        """Store ``settings`` and configure the sentry scope for the service."""
        self._settings = settings
        self._stub = None
        self._sock_port = None
        self._internal_proc = None
        self._startup_debug_enabled = _startup_debug.is_enabled()

        _sentry.configure_scope(tags=dict(settings), process_context="service")

    def _startup_debug_print(self, message: str) -> None:
        """Emit ``message`` through ``_startup_debug`` when that is enabled."""
        if not self._startup_debug_enabled:
            return
        _startup_debug.print_message(message)

    def _wait_for_ports(
        self, fname: str, proc: Optional[subprocess.Popen] = None
    ) -> None:
        """Wait for the service to write the port file and then read it.

        Args:
            fname: The path to the port file.
            proc: The process to wait for.

        Raises:
            ServiceStartTimeoutError: If the service takes too long to start.
            ServiceStartPortError: If the service writes an invalid port file or unable to read it.
            ServiceStartProcessError: If the service process exits unexpectedly.

        """
        time_max = time.monotonic() + self._settings.x_service_wait
        while time.monotonic() < time_max:
            # `poll()` returns None while the process is running and its exit
            # code once it has terminated. Compare against None rather than
            # relying on truthiness so that an exit code of 0 is also treated
            # as "the process is gone" instead of spinning until the timeout.
            if proc is not None and proc.poll() is not None:
                # The context is both embedded in the message and attached to
                # the error via the `context` keyword.
                context = dict(
                    command=proc.args,
                    sys_executable=sys.executable,
                    which_python=shutil.which("python3"),
                    proc_out=proc.stdout.read() if proc.stdout else "",
                    proc_err=proc.stderr.read() if proc.stderr else "",
                )
                raise ServiceStartProcessError(
                    f"The wandb service process exited with {proc.returncode}. "
                    "Ensure that `sys.executable` is a valid python interpreter. "
                    "You can override it with the `_executable` setting "
                    "or with the `WANDB_X_EXECUTABLE` environment variable."
                    f"\n{context}",
                    context=context,
                )
            if not os.path.isfile(fname):
                time.sleep(0.2)
                continue
            try:
                pf = port_file.PortFile()
                pf.read(fname)
                if not pf.is_valid:
                    # The file exists but is not (yet) valid; retry shortly.
                    time.sleep(0.2)
                    continue
                self._sock_port = pf.sock_port
            except Exception as e:
                # todo: point at the docs. this could be due to a number of reasons,
                # for example, being unable to write to the port file etc.
                raise ServiceStartPortError(
                    f"Failed to allocate port for wandb service: {e}."
                ) from e
            return
        raise ServiceStartTimeoutError(
            "Timed out waiting for wandb service to start after "
            f"{self._settings.x_service_wait} seconds. "
            "Try increasing the timeout with the `_service_wait` setting."
        )

    def _launch_server(self) -> None:
        """Launch server and set ports."""
        # References for starting processes
        # - https://github.com/wandb/wandb/blob/archive/old-cli/wandb/__init__.py
        # - https://stackoverflow.com/questions/1196074/how-to-start-a-background-process-in-python
        self._startup_debug_print("launch")

        kwargs: Dict[str, Any] = dict(close_fds=True)
        # flags to handle keyboard interrupt signal that is causing a hang
        if platform.system() == "Windows":
            kwargs.update(creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)  # type: ignore [attr-defined]
        else:
            kwargs.update(start_new_session=True)

        pid = str(os.getpid())

        # The port file lives in a temporary directory, so everything that
        # needs the file must happen inside this `with` block.
        with tempfile.TemporaryDirectory() as tmpdir:
            fname = os.path.join(tmpdir, f"port-{pid}.txt")

            executable = self._settings.x_executable
            exec_cmd_list = [executable, "-m"]

            service_args = []

            if not is_require_legacy_service():
                try:
                    core_path = get_core_path()
                except WandbCoreNotAvailableError as e:
                    # NOTE(review): presumably `_sentry.reraise` always raises;
                    # otherwise `core_path` would be unbound below - confirm.
                    _sentry.reraise(e)

                service_args.extend([core_path])

                if not error_reporting_enabled():
                    service_args.append("--no-observability")

                if core_debug(default="False"):
                    service_args.extend(["--log-level", "-4"])

                if dcgm_profiling_enabled():
                    service_args.append("--enable-dcgm-profiling")

                # The core binary is executed directly, without an interpreter prefix.
                exec_cmd_list = []
            else:
                service_args.extend(["wandb", "service", "--debug"])
                termwarn(
                    "Using legacy-service, which is deprecated. If this is"
                    " unintentional, you can fix it by ensuring you do not call"
                    " `wandb.require('legacy-service')` and do not set the"
                    " WANDB_X_REQUIRE_LEGACY_SERVICE environment"
                    " variable."
                )

            service_args += [
                "--port-filename",
                fname,
                "--pid",
                pid,
            ]

            if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
                _ = get_module(
                    "memray",
                    required=(
                        "wandb service memory profiling requires memray, "
                        "install with `pip install memray`"
                    ),
                )

                time_tag = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                output_file = f"wandb_service.memray.{time_tag}.bin"
                cli_executable = (
                    pathlib.Path(__file__).parent.parent.parent.parent
                    / "tools"
                    / "cli.py"
                )
                exec_cmd_list = [
                    executable,
                    "-m",
                    "memray",
                    "run",
                    "-o",
                    output_file,
                ]
                # Under memray the first service argument is replaced by the
                # path to tools/cli.py.
                service_args[0] = str(cli_executable)
                termlog(
                    f"wandb service memory profiling enabled, output file: {output_file}"
                )
                termlog(
                    f"Convert to flamegraph with: `python -m memray flamegraph {output_file}`"
                )

            try:
                internal_proc = subprocess.Popen(
                    exec_cmd_list + service_args,  # type: ignore[arg-type]
                    env=os.environ,
                    **kwargs,
                )
            except Exception as e:
                _sentry.reraise(e)

            self._startup_debug_print("wait_ports")
            try:
                self._wait_for_ports(fname, proc=internal_proc)
            except Exception as e:
                _sentry.reraise(e)
            self._startup_debug_print("wait_ports_done")
            self._internal_proc = internal_proc
        self._startup_debug_print("launch_done")

    def start(self) -> None:
        """Launch the service process and wait for its port."""
        self._launch_server()

    @property
    def sock_port(self) -> Optional[int]:
        """Port read from the port file, or ``None`` before a successful start."""
        return self._sock_port

    def join(self) -> int:
        """Wait for the service process to exit and return its exit code.

        Returns ``0`` when no process was launched.
        """
        ret = 0
        if self._internal_proc:
            ret = self._internal_proc.wait()
        return ret
|