wandb 0.20.1__py3-none-any.whl → 0.20.2rc20250616__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +41 -67
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/service/streams.py
DELETED
@@ -1,425 +0,0 @@
|
|
1
|
-
"""streams: class that manages internal threads for each run.
|
2
|
-
|
3
|
-
StreamThread: Thread that runs internal.wandb_internal()
|
4
|
-
StreamRecord: All the external state for the internal thread (queues, etc)
|
5
|
-
StreamAction: Lightweight record for stream ops for thread safety
|
6
|
-
StreamMux: Container for dictionary of stream threads per runid
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
import asyncio
|
12
|
-
import functools
|
13
|
-
import queue
|
14
|
-
import threading
|
15
|
-
import time
|
16
|
-
from threading import Event
|
17
|
-
from typing import Any, Callable, NoReturn
|
18
|
-
|
19
|
-
import psutil
|
20
|
-
|
21
|
-
from wandb.proto import wandb_internal_pb2 as pb
|
22
|
-
from wandb.sdk.interface.interface_relay import InterfaceRelay
|
23
|
-
from wandb.sdk.interface.router_relay import MessageRelayRouter
|
24
|
-
from wandb.sdk.internal.internal import wandb_internal
|
25
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
26
|
-
from wandb.sdk.lib import asyncio_compat, progress
|
27
|
-
from wandb.sdk.lib import printer as printerlib
|
28
|
-
from wandb.sdk.mailbox import Mailbox, MailboxHandle, wait_all_with_progress
|
29
|
-
from wandb.sdk.wandb_run import Run
|
30
|
-
|
31
|
-
|
32
|
-
class StreamThread(threading.Thread):
    """Daemon thread that runs the internal process entry point.

    The thread is always named ``"StreamThr"`` and is created as a daemon
    thread.

    Args:
        target: Callable invoked when the thread is started.
        kwargs: Keyword arguments passed to ``target``.
    """

    def __init__(self, target: Callable, kwargs: dict[str, Any]) -> None:
        # Hand everything to Thread's own constructor rather than overwriting
        # its private ``_target``/``_kwargs`` attributes after the fact.
        super().__init__(
            target=target,
            kwargs=kwargs,
            name="StreamThr",
            daemon=True,
        )

    def run(self) -> None:
        """Invoke the target with the stored keyword arguments."""
        # TODO: catch exceptions and report errors to scheduler
        super().run()
|
45
|
-
|
46
|
-
|
47
|
-
class StreamRecord:
    """All the external state for one stream's internal thread.

    Owns the record/result/relay queues, the mailbox, the relay router and
    the relay interface that are wired to those queues, plus the settings
    object and the thread handed to ``start_thread``.

    Args:
        settings: Settings stored on the record (see ``update``).
    """

    _record_q: queue.Queue[pb.Record]
    _result_q: queue.Queue[pb.Result]
    _relay_q: queue.Queue[pb.Result]
    _mailbox: Mailbox
    _router: MessageRelayRouter
    _iface: InterfaceRelay
    # Stays None until start_thread() is called, so join() can safely skip it.
    _thread: StreamThread | None = None
    _settings: SettingsStatic
    _started: bool

    def __init__(self, settings: SettingsStatic) -> None:
        self._started = False
        self._thread = None
        self._mailbox = Mailbox()
        self._record_q = queue.Queue()
        self._result_q = queue.Queue()
        self._relay_q = queue.Queue()
        # The router and the interface share the same queues and mailbox.
        self._router = MessageRelayRouter(
            request_queue=self._record_q,
            response_queue=self._result_q,
            relay_queue=self._relay_q,
            mailbox=self._mailbox,
        )
        self._iface = InterfaceRelay(
            record_q=self._record_q,
            result_q=self._result_q,
            relay_q=self._relay_q,
            mailbox=self._mailbox,
        )
        self._settings = settings

    def start_thread(self, thread: StreamThread) -> None:
        """Store and start ``thread``, then wait for a status response."""
        self._thread = thread
        thread.start()
        self._wait_thread_active()

    def _wait_thread_active(self) -> None:
        """Deliver a status request and wait (no timeout) for its result."""
        self._iface.deliver_status().wait_or(timeout=None)

    def join(self) -> None:
        """Join the interface, the router and, if one was started, the thread."""
        self._iface.join()
        self._router.join()
        if self._thread is not None:
            self._thread.join()

    def drop(self) -> None:
        """Set the interface's ``_drop`` flag."""
        self._iface._drop = True

    @property
    def interface(self) -> InterfaceRelay:
        """The relay interface for this stream."""
        return self._iface

    def mark_started(self) -> None:
        """Flag the stream as started."""
        self._started = True

    def update(self, settings: SettingsStatic) -> None:
        """Replace the stored settings object with ``settings``."""
        # Note: Currently just overriding the _settings attribute
        # once we use Settings Class we might want to properly update it
        self._settings = settings
|
104
|
-
|
105
|
-
|
106
|
-
class StreamAction:
    """Lightweight record describing one stream operation.

    Carries the action name, the stream it targets, an optional payload, and
    an event the consumer sets once the action has been processed.
    """

    _action: str
    _stream_id: str
    _processed: Event
    _data: Any

    def __init__(self, action: str, stream_id: str, data: Any | None = None) -> None:
        self._processed = Event()
        self._data = data
        self._stream_id = stream_id
        self._action = action

    def __repr__(self) -> str:
        return f"StreamAction({self._action},{self._stream_id})"

    @property
    def stream_id(self) -> str:
        """Identifier of the stream this action targets."""
        return self._stream_id

    def set_handled(self) -> None:
        """Signal that the action has been processed."""
        self._processed.set()

    def wait_handled(self) -> None:
        """Block until ``set_handled`` has been called."""
        self._processed.wait()
|
130
|
-
|
131
|
-
|
132
|
-
class StreamMux:
    """Container for the dictionary of stream records, keyed by stream id.

    Mutating operations (``add_stream``, ``start_stream``, ``update_stream``,
    ``del_stream``, ``drop_stream``, ``teardown``) enqueue a ``StreamAction``
    and block until the thread running ``loop()`` has processed it. Read
    accessors take ``_streams_lock`` and read ``_streams`` directly.
    """

    _streams_lock: threading.Lock
    _streams: dict[str, StreamRecord]
    _port: int | None
    _pid: int | None
    _action_q: queue.Queue[StreamAction]
    _stopped: Event
    _pid_checked_ts: float | None

    def __init__(self) -> None:
        self._streams_lock = threading.Lock()
        self._streams = {}
        self._port = None
        self._pid = None
        self._stopped = Event()
        self._action_q = queue.Queue()
        self._pid_checked_ts = None

    def _get_stopped_event(self) -> Event:
        """Return the event that is set once the mux has stopped."""
        # TODO: clean this up, there should be a better way to abstract this
        return self._stopped

    def set_port(self, port: int) -> None:
        """Record the port passed as ``port`` to each new stream thread."""
        self._port = port

    def set_pid(self, pid: int) -> None:
        """Record the pid passed as ``user_pid`` and watched for orphaning."""
        self._pid = pid

    def _submit(self, action: StreamAction) -> None:
        """Queue ``action`` and block until the loop marks it handled."""
        self._action_q.put(action)
        action.wait_handled()

    def add_stream(self, stream_id: str, settings: SettingsStatic) -> None:
        """Request creation of a stream and wait until it is processed."""
        self._submit(StreamAction(action="add", stream_id=stream_id, data=settings))

    def start_stream(self, stream_id: str) -> None:
        """Request that a stream be marked started and wait for it."""
        self._submit(StreamAction(action="start", stream_id=stream_id))

    def update_stream(self, stream_id: str, settings: SettingsStatic) -> None:
        """Request a settings replacement for a stream and wait for it."""
        self._submit(
            StreamAction(action="update", stream_id=stream_id, data=settings)
        )

    def del_stream(self, stream_id: str) -> None:
        """Request removal (and join) of a stream and wait for it."""
        self._submit(StreamAction(action="del", stream_id=stream_id))

    def drop_stream(self, stream_id: str) -> None:
        """Request that a stream be dropped, if present, and wait for it."""
        self._submit(StreamAction(action="drop", stream_id=stream_id))

    def teardown(self, exit_code: int) -> None:
        """Request that all streams be finished with ``exit_code`` and wait."""
        self._submit(StreamAction(action="teardown", stream_id="na", data=exit_code))

    def stream_names(self) -> list[str]:
        """Return a snapshot of the known stream ids."""
        with self._streams_lock:
            return list(self._streams)

    def has_stream(self, stream_id: str) -> bool:
        """Return whether a stream with this id is registered."""
        with self._streams_lock:
            return stream_id in self._streams

    def get_stream(self, stream_id: str) -> StreamRecord:
        """Returns the StreamRecord for the ID.

        Raises:
            KeyError: If a corresponding StreamRecord does not exist.
        """
        with self._streams_lock:
            return self._streams[stream_id]

    def _process_add(self, action: StreamAction) -> None:
        """Create a stream record, start its thread, then register it."""
        settings = action._data
        stream = StreamRecord(settings)
        # run_id = action.stream_id  # will want to fix if a streamid != runid
        thread = StreamThread(
            target=wandb_internal,
            kwargs=dict(
                settings=settings,
                record_q=stream._record_q,
                result_q=stream._result_q,
                port=self._port,
                user_pid=self._pid,
            ),
        )
        # The record only becomes visible to readers after its thread started.
        stream.start_thread(thread)
        with self._streams_lock:
            self._streams[action._stream_id] = stream

    def _process_start(self, action: StreamAction) -> None:
        """Mark the targeted stream as started (KeyError if unknown)."""
        with self._streams_lock:
            self._streams[action._stream_id].mark_started()

    def _process_update(self, action: StreamAction) -> None:
        """Replace the targeted stream's settings (KeyError if unknown)."""
        with self._streams_lock:
            self._streams[action._stream_id].update(action._data)

    def _process_del(self, action: StreamAction) -> None:
        """Remove the targeted stream and join it (KeyError if unknown)."""
        with self._streams_lock:
            stream = self._streams.pop(action._stream_id)
            stream.join()
        # TODO: we assume stream has already been shutdown. should we verify?

    def _process_drop(self, action: StreamAction) -> None:
        """Remove, drop and join the targeted stream; no-op if unknown."""
        with self._streams_lock:
            stream = self._streams.pop(action._stream_id, None)
            if stream is not None:
                stream.drop()
                stream.join()

    async def _finish_all_progress(
        self,
        progress_printer: progress.ProgressPrinter,
        streams_to_watch: dict[str, StreamRecord],
    ) -> None:
        """Poll the streams and display statistics about them.

        This never returns and must be cancelled.

        Args:
            progress_printer: Printer to use for displaying finish progress.
            streams_to_watch: Streams to poll for finish progress.
        """
        # Latest poll-exit result per stream, shared by the tasks below.
        results: dict[str, pb.Result | None] = {}

        async def loop_poll_stream(
            stream_id: str,
            stream: StreamRecord,
        ) -> NoReturn:
            while True:
                start_time = time.monotonic()

                handle = stream.interface.deliver_poll_exit()
                results[stream_id] = await handle.wait_async(timeout=None)

                # Poll each stream at most once per second.
                elapsed_time = time.monotonic() - start_time
                if elapsed_time < 1:
                    await asyncio.sleep(1 - elapsed_time)

        async def loop_update_printer() -> NoReturn:
            while True:
                poll_exit_responses: list[pb.PollExitResponse] = []
                for result in results.values():
                    if not result or not result.response:
                        continue
                    if poll_exit_response := result.response.poll_exit_response:
                        poll_exit_responses.append(poll_exit_response)

                progress_printer.update(poll_exit_responses)
                await asyncio.sleep(1)

        async with asyncio_compat.open_task_group() as task_group:
            for stream_id, stream in streams_to_watch.items():
                task_group.start_soon(loop_poll_stream(stream_id, stream))
            task_group.start_soon(loop_update_printer())

    def _finish_all(self, streams: dict[str, StreamRecord], exit_code: int) -> None:
        """Finish every started stream, print its footer, and join all streams.

        Args:
            streams: Streams to finish, keyed by stream id.
            exit_code: Exit code delivered to each started stream.
        """
        if not streams:
            return

        printer = printerlib.new_printer()

        # fixme: for now we have a single printer for all streams,
        # and jupyter is disabled if at least single stream's setting set `_jupyter` to false
        exit_handles: list[MailboxHandle[pb.Result]] = []

        # only finish started streams, non started streams failed early
        started_streams: dict[str, StreamRecord] = {}
        not_started_streams: dict[str, StreamRecord] = {}
        for stream_id, stream in streams.items():
            bucket = started_streams if stream._started else not_started_streams
            bucket[stream_id] = stream

        for stream in started_streams.values():
            exit_handles.append(stream.interface.deliver_exit(exit_code))

        with progress.progress_printer(
            printer,
            default_text="Finishing up...",
        ) as progress_printer:
            # todo: should we wait for the max timeout (?) of all exit handles or just wait forever?
            # timeout = max(stream._settings._exit_timeout for stream in streams.values())
            wait_all_with_progress(
                exit_handles,
                timeout=None,
                progress_after=1,
                display_progress=functools.partial(
                    self._finish_all_progress,
                    progress_printer,
                    started_streams,
                ),
            )

        # These could be done in parallel in the future
        for stream in started_streams.values():
            # dispatch all our final requests
            poll_exit_handle = stream.interface.deliver_poll_exit()
            final_summary_handle = stream.interface.deliver_get_summary()
            sampled_history_handle = stream.interface.deliver_request_sampled_history()
            internal_messages_handle = stream.interface.deliver_internal_messages()

            result = internal_messages_handle.wait_or(timeout=None)
            internal_messages_response = result.response.internal_messages_response

            result = poll_exit_handle.wait_or(timeout=None)
            poll_exit_response = result.response.poll_exit_response

            result = sampled_history_handle.wait_or(timeout=None)
            sampled_history = result.response.sampled_history_response

            result = final_summary_handle.wait_or(timeout=None)
            final_summary = result.response.get_summary_response

            Run._footer(
                sampled_history=sampled_history,
                final_summary=final_summary,
                poll_exit_response=poll_exit_response,
                internal_messages_response=internal_messages_response,
                settings=stream._settings,  # type: ignore
                printer=printer,
            )
            stream.join()

        # not started streams need to be cleaned up
        for stream in not_started_streams.values():
            stream.join()

    def _process_teardown(self, action: StreamAction) -> None:
        """Finish a snapshot of all streams, clear the table, and mark stopped."""
        exit_code: int = action._data
        with self._streams_lock:
            # TODO: mark streams to prevent new modifications?
            streams_copy = self._streams.copy()
        # Finishing happens outside the lock, on the snapshot.
        self._finish_all(streams_copy, exit_code)
        with self._streams_lock:
            self._streams = {}
        self._stopped.set()

    def _process_action(self, action: StreamAction) -> None:
        """Dispatch ``action`` to its handler.

        Raises:
            AssertionError: If the action name is not supported.
        """
        handlers = {
            "add": self._process_add,
            "update": self._process_update,
            "start": self._process_start,
            "del": self._process_del,
            "drop": self._process_drop,
            "teardown": self._process_teardown,
        }
        handler = handlers.get(action._action)
        if handler is None:
            raise AssertionError(f"Unsupported action: {action._action}")
        handler(action)

    def _check_orphaned(self) -> bool:
        """Return True if the recorded pid no longer exists.

        Returns False when no pid has been set, and also when the previous
        check happened less than 2 seconds ago.
        """
        if not self._pid:
            return False
        time_now = time.time()
        # if we have checked already and it was less than 2 seconds ago
        if self._pid_checked_ts is not None and time_now < self._pid_checked_ts + 2:
            return False
        self._pid_checked_ts = time_now
        return not psutil.pid_exists(self._pid)

    def _loop(self) -> None:
        """Process queued actions until the stopped event is set."""
        while not self._stopped.is_set():
            if self._check_orphaned():
                # parent process is gone, let other threads know we need to shut down
                self._stopped.set()
            try:
                action = self._action_q.get(timeout=1)
            except queue.Empty:
                continue
            try:
                self._process_action(action)
            finally:
                # Always release the caller blocked in wait_handled() and
                # balance the queue, even if the handler raised; the
                # exception itself still propagates.
                action.set_handled()
                self._action_q.task_done()
        # Blocks until every queued action has been marked done.
        self._action_q.join()

    def loop(self) -> None:
        """Run the action-processing loop in the calling thread."""
        self._loop()

    def cleanup(self) -> None:
        """No-op."""
        pass
|
File without changes
|
File without changes
|
File without changes
|