wandb 0.20.1__py3-none-win32.whl → 0.20.2rc20250616__py3-none-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +42 -68
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
@@ -1,263 +0,0 @@
|
|
1
|
-
"""Flow Control.
|
2
|
-
|
3
|
-
States:
|
4
|
-
FORWARDING
|
5
|
-
PAUSING
|
6
|
-
|
7
|
-
New messages:
|
8
|
-
pb.SenderMarkRequest writer -> sender (empty message)
|
9
|
-
pb.StatusReportRequest sender -> writer (reports current sender progress)
|
10
|
-
pb.SenderReadRequest writer -> sender (requests read of transaction log)
|
11
|
-
|
12
|
-
Thresholds:
|
13
|
-
Threshold_High_MaxOutstandingData - When above this, stop sending requests to sender
|
14
|
-
Threshold_Mid_StartSendingReadRequests - When below this, start sending read requests
|
15
|
-
Threshold_Low_RestartSendingData - When below this, start sending normal records
|
16
|
-
|
17
|
-
State machine:
|
18
|
-
FORWARDING
|
19
|
-
-> PAUSING if should_pause
|
20
|
-
There is too much work outstanding to the sender thread, after the current request
|
21
|
-
let's stop sending data.
|
22
|
-
PAUSING
|
23
|
-
-> FORWARDING if should_unpause
|
24
|
-
-> PAUSING if should_recover
|
25
|
-
-> PAUSING if should_quiesce
|
26
|
-
|
27
|
-
"""
|
28
|
-
|
29
|
-
import logging
|
30
|
-
from dataclasses import dataclass
|
31
|
-
from typing import TYPE_CHECKING, Callable, Optional
|
32
|
-
|
33
|
-
from wandb.proto import wandb_internal_pb2 as pb
|
34
|
-
from wandb.sdk.lib import fsm
|
35
|
-
|
36
|
-
from .settings_static import SettingsStatic
|
37
|
-
|
38
|
-
if TYPE_CHECKING:
|
39
|
-
from wandb.proto.wandb_internal_pb2 import Record
|
40
|
-
|
41
|
-
logger = logging.getLogger(__name__)
|
42
|
-
|
43
|
-
# By default we will allow 128 MiB of requests in the sender queue
|
44
|
-
# before falling back to the transaction log.
|
45
|
-
DEFAULT_THRESHOLD = 128 * 1024 * 1024 # 128 MiB
|
46
|
-
|
47
|
-
|
48
|
-
def _get_request_type(record: "Record") -> Optional[str]:
    """Return the name of the request carried by *record*, if it carries one.

    Args:
        record: Message exposing ``WhichOneof`` and, for request records,
            a ``request`` sub-message that also exposes ``WhichOneof``.

    Returns:
        Whatever ``record.request.WhichOneof("request_type")`` reports when
        the record's ``record_type`` oneof is ``"request"``; ``None`` for
        every other kind of record.
    """
    # Only request records have a nested request_type worth inspecting.
    if record.WhichOneof("record_type") == "request":
        return record.request.WhichOneof("request_type")
    return None
|
54
|
-
|
55
|
-
|
56
|
-
def _is_control_record(record: "Record") -> bool:
    """Report the ``flow_control`` flag stored on the record's control field.

    Args:
        record: Message whose ``control.flow_control`` attribute is read.

    Returns:
        The value of ``record.control.flow_control``, unchanged.
    """
    control = record.control
    return control.flow_control
|
58
|
-
|
59
|
-
|
60
|
-
def _is_local_non_control_record(record: "Record") -> bool:
    """Tell whether *record* is flagged local but not flagged as flow control.

    Args:
        record: Message whose ``control.local`` and ``control.flow_control``
            attributes are read.

    Returns:
        Truthy only when ``control.local`` is set and
        ``control.flow_control`` is not.
    """
    control = record.control
    return control.local and not control.flow_control
|
62
|
-
|
63
|
-
|
64
|
-
@dataclass
class StateContext:
    """Byte offsets tracked by the flow-control states.

    A state returns its context from ``on_exit`` and adopts the one it is
    given in ``on_enter``, so the same counters can follow a transition.
    All three offsets start at zero.
    """

    # Copy of ``last_written_offset`` taken when records were last forwarded.
    last_forwarded_offset: int = 0
    # ``sent_offset`` from the most recent status report request.
    last_sent_offset: int = 0
    # Most recent non-zero ``control.end_offset`` seen on a record.
    last_written_offset: int = 0
|
69
|
-
|
70
|
-
|
71
|
-
class FlowControl:
    """Finite-state machine deciding whether records are forwarded or held.

    Two states are registered: ``StateForwarding`` and ``StatePausing``.
    Every record given to :meth:`flow` is fed to the state machine, which
    runs the current state's checks and, when a transition predicate
    matches, the associated action.
    """

    _fsm: fsm.FsmWithContext["Record", StateContext]

    def __init__(
        self,
        settings: SettingsStatic,
        forward_record: Callable[["Record"], None],
        write_record: Callable[["Record"], int],
        pause_marker: Callable[[], None],
        recover_records: Callable[[int, int], None],
        _threshold_bytes_high: int = 0,
        _threshold_bytes_mid: int = 0,
        _threshold_bytes_low: int = 0,
    ) -> None:
        """Build both states and wire up the transition table.

        Args:
            settings: Supplies ``x_network_buffer``, used as the base
                threshold when the explicit thresholds are not all given.
            forward_record: Called with each record that is forwarded.
            write_record: Accepted for interface compatibility; not used
                in this constructor.
            pause_marker: Called when moving from forwarding to pausing.
            recover_records: Called with ``(start, end)`` offsets when the
                pausing state needs skipped records recovered.
            _threshold_bytes_high: Backlog at or above which forwarding pauses.
            _threshold_bytes_mid: Backlog below which recovery is triggered.
            _threshold_bytes_low: Backlog below which forwarding resumes.

        Raises:
            AssertionError: If the thresholds are not strictly decreasing
                (high > mid > low).
        """
        # thresholds to define when to PAUSE, RESTART, FORWARDING
        # If any threshold is left at 0, all three are derived from one base.
        if 0 in (_threshold_bytes_high, _threshold_bytes_mid, _threshold_bytes_low):
            base = settings.x_network_buffer or DEFAULT_THRESHOLD
            _threshold_bytes_high = base
            _threshold_bytes_mid = base // 2
            _threshold_bytes_low = base // 4
        assert _threshold_bytes_high > _threshold_bytes_mid > _threshold_bytes_low

        # FSM definition
        forwarding = StateForwarding(
            forward_record=forward_record,
            pause_marker=pause_marker,
            threshold_pause=_threshold_bytes_high,
        )
        pausing = StatePausing(
            forward_record=forward_record,
            recover_records=recover_records,
            threshold_recover=_threshold_bytes_mid,
            threshold_forward=_threshold_bytes_low,
        )

        # Entries are (predicate, destination state, action); the pausing
        # entries are listed unpause, recover, quiesce - presumably the FSM
        # evaluates them in this order (fsm module not visible here).
        leave_forwarding = [
            fsm.FsmEntry(forwarding._should_pause, StatePausing, forwarding._pause),
        ]
        leave_pausing = [
            fsm.FsmEntry(pausing._should_unpause, StateForwarding, pausing._unpause),
            fsm.FsmEntry(pausing._should_recover, StatePausing, pausing._recover),
            fsm.FsmEntry(pausing._should_quiesce, StatePausing, pausing._quiesce),
        ]
        self._fsm = fsm.FsmWithContext(
            states=[forwarding, pausing],
            table={
                StateForwarding: leave_forwarding,
                StatePausing: leave_pausing,
            },
        )

    def flush(self) -> None:
        """Do nothing; flushing is not implemented yet."""
        # TODO(mempressure): what do we do here, how do we make sure we dont have work in pause state
        pass

    def flow(self, record: "Record") -> None:
        """Feed *record* into the state machine."""
        self._fsm.input(record)
|
145
|
-
|
146
|
-
|
147
|
-
class StateShared:
    """Bookkeeping shared by the forwarding and pausing states.

    Holds the ``StateContext`` offsets and the helpers that keep them up to
    date, and dispatches request records to ``_process_<request_type>``
    methods when such a method exists on the instance.
    """

    _context: StateContext

    def __init__(self) -> None:
        """Start with a fresh context whose offsets are all zero."""
        self._context = StateContext()

    def _update_written_offset(self, record: "Record") -> None:
        """Record ``control.end_offset`` as the written offset when it is non-zero."""
        offset = record.control.end_offset
        if not offset:
            return
        self._context.last_written_offset = offset

    def _update_forwarded_offset(self) -> None:
        """Mark everything written so far as forwarded."""
        ctx = self._context
        ctx.last_forwarded_offset = ctx.last_written_offset

    def _process(self, record: "Record") -> None:
        """Route a request record to its ``_process_<request_type>`` method.

        Records that are not requests, and request types without a matching
        method, are ignored.
        """
        request_type = _get_request_type(record)
        if not request_type:
            return
        handler = getattr(self, f"_process_{request_type}", None)
        if handler:
            handler(record)

    def _process_status_report(self, record: "Record") -> None:
        """Store the ``sent_offset`` carried by a status report request."""
        self._context.last_sent_offset = record.request.status_report.sent_offset

    def on_exit(self, record: "Record") -> StateContext:
        """Hand back this state's context when the state is left."""
        return self._context

    def on_enter(self, record: "Record", context: StateContext) -> None:
        """Adopt *context* when the state is entered."""
        self._context = context

    @property
    def _behind_bytes(self) -> int:
        """Difference between the forwarded offset and the sent offset."""
        ctx = self._context
        return ctx.last_forwarded_offset - ctx.last_sent_offset
|
186
|
-
|
187
|
-
|
188
|
-
class StateForwarding(StateShared):
    """State in which non-control records are passed straight through."""

    _forward_record: Callable[["Record"], None]
    _pause_marker: Callable[[], None]
    _threshold_pause: int

    def __init__(
        self,
        forward_record: Callable[["Record"], None],
        pause_marker: Callable[[], None],
        threshold_pause: int,
    ) -> None:
        """Store the callbacks and the pause threshold.

        Args:
            forward_record: Called with every non-control record checked.
            pause_marker: Called by the pause action.
            threshold_pause: Backlog at or above which pausing is requested.
        """
        super().__init__()
        self._threshold_pause = threshold_pause
        self._pause_marker = pause_marker
        self._forward_record = forward_record

    def _should_pause(self, record: "Record") -> bool:
        """Return True once the backlog reaches the pause threshold."""
        return self._behind_bytes >= self._threshold_pause

    def _pause(self, record: "Record") -> None:
        """Transition action: invoke the pause marker callback."""
        self._pause_marker()

    def on_check(self, record: "Record") -> None:
        """Update offsets, handle requests, and forward non-control records."""
        self._update_written_offset(record)
        self._process(record)
        is_control = _is_control_record(record)
        if not is_control:
            self._forward_record(record)
        # The forwarded offset is advanced for control records as well.
        self._update_forwarded_offset()
|
216
|
-
|
217
|
-
|
218
|
-
class StatePausing(StateShared):
    """State in which records are held back instead of forwarded on check.

    All three transition actions (unpause, recover, quiesce) run the same
    catch-up routine, :meth:`_quiesce`.
    """

    _forward_record: Callable[["Record"], None]
    _recover_records: Callable[[int, int], None]
    _threshold_recover: int
    _threshold_forward: int

    def __init__(
        self,
        forward_record: Callable[["Record"], None],
        recover_records: Callable[[int, int], None],
        threshold_recover: int,
        threshold_forward: int,
    ) -> None:
        """Store the callbacks and thresholds.

        Args:
            forward_record: Called with local non-control records during
                catch-up.
            recover_records: Called with ``(start, end)`` offsets for the
                span that was written but not forwarded.
            threshold_recover: Backlog below which recovery is requested.
            threshold_forward: Backlog below which unpausing is requested.
        """
        super().__init__()
        self._threshold_forward = threshold_forward
        self._threshold_recover = threshold_recover
        self._recover_records = recover_records
        self._forward_record = forward_record

    def _should_unpause(self, record: "Record") -> bool:
        """Return True when the backlog is below the forward threshold."""
        return self._behind_bytes < self._threshold_forward

    def _unpause(self, record: "Record") -> None:
        """Transition action for unpausing: catch up via :meth:`_quiesce`."""
        self._quiesce(record)

    def _should_recover(self, record: "Record") -> bool:
        """Return True when the backlog is below the recover threshold."""
        return self._behind_bytes < self._threshold_recover

    def _recover(self, record: "Record") -> None:
        """Transition action for recovering: catch up via :meth:`_quiesce`."""
        self._quiesce(record)

    def _should_quiesce(self, record: "Record") -> bool:
        """Return True for local records that are not flow-control records."""
        return _is_local_non_control_record(record)

    def _quiesce(self, record: "Record") -> None:
        """Recover the unforwarded span, forward *record* if local, sync offsets."""
        ctx = self._context
        start, end = ctx.last_forwarded_offset, ctx.last_written_offset
        # Only ask for recovery when something was written but not forwarded.
        if start != end:
            self._recover_records(start, end)
        if _is_local_non_control_record(record):
            self._forward_record(record)
        self._update_forwarded_offset()

    def on_check(self, record: "Record") -> None:
        """Update offsets and handle requests; nothing is forwarded here."""
        self._update_written_offset(record)
        self._process(record)
|
wandb/sdk/internal/internal.py
DELETED
@@ -1,401 +0,0 @@
|
|
1
|
-
#
|
2
|
-
"""Internal process.
|
3
|
-
|
4
|
-
This module implements the entrypoint for the internal process. The internal process
|
5
|
-
is responsible for handling "record" requests, and responding with "results". Data is
|
6
|
-
passed to the process over multiprocessing queues.
|
7
|
-
|
8
|
-
Threads:
|
9
|
-
HandlerThread -- read from record queue and call handlers
|
10
|
-
SenderThread -- send to network
|
11
|
-
WriterThread -- write to disk
|
12
|
-
|
13
|
-
"""
|
14
|
-
|
15
|
-
import atexit
|
16
|
-
import logging
|
17
|
-
import os
|
18
|
-
import queue
|
19
|
-
import sys
|
20
|
-
import threading
|
21
|
-
import time
|
22
|
-
import traceback
|
23
|
-
from datetime import datetime
|
24
|
-
from typing import TYPE_CHECKING, Any, List, Optional
|
25
|
-
|
26
|
-
import psutil
|
27
|
-
|
28
|
-
import wandb
|
29
|
-
|
30
|
-
from ..interface.interface_queue import InterfaceQueue
|
31
|
-
from . import context, handler, internal_util, sender, writer
|
32
|
-
|
33
|
-
if TYPE_CHECKING:
|
34
|
-
from queue import Queue
|
35
|
-
from threading import Event
|
36
|
-
|
37
|
-
from wandb.proto.wandb_internal_pb2 import Record, Result
|
38
|
-
|
39
|
-
from .internal_util import RecordLoopThread
|
40
|
-
from .settings_static import SettingsStatic
|
41
|
-
|
42
|
-
|
43
|
-
logger = logging.getLogger(__name__)
|
44
|
-
|
45
|
-
|
46
|
-
def wandb_internal(
    settings: "SettingsStatic",
    record_q: "Queue[Record]",
    result_q: "Queue[Result]",
    port: Optional[int] = None,
    user_pid: Optional[int] = None,
) -> None:
    """Run the internal process: start the worker threads and wait for stop.

    Creates a sender, a writer and a handler thread connected by in-process
    queues, then polls once per second until the shared stop event is set,
    the watched user process disappears, or two keyboard interrupts arrive.
    If any thread recorded an exception, it is reported and the process
    exits immediately with ``os._exit(-1)``.

    Args:
        settings: settings object; read for logging configuration and the
            parent-process check interval.
        record_q: records to be handled.
        result_q: for sending results back.
        port: not used by this function.
        user_pid: pid of the process to watch; defaults to the parent pid.
    """
    # mark this process as internal
    wandb._set_internal_process()  # type: ignore
    started = time.time()

    # any sentry events in the internal process will be tagged as such
    wandb._sentry.configure_scope(process_context="internal", tags=dict(settings))

    # register the exit handler only when wandb_internal is called, not on import
    @atexit.register
    def handle_exit(*args: "Any") -> None:
        logger.info("Internal process exited")

    # Settings are only read here, never modified.
    if settings.log_internal:
        configure_logging(settings.log_internal, settings.x_log_level)

    user_pid = user_pid or os.getppid()

    logger.info(
        "W&B internal server running at pid: %s, started at: %s",
        os.getpid(),
        datetime.fromtimestamp(started),
    )

    publish_interface = InterfaceQueue(record_q=record_q)
    stopped = threading.Event()
    context_keeper = context.ContextKeeper()

    # handler -> writer -> sender hand-off queues
    send_record_q: "Queue[Record]" = queue.Queue()
    write_record_q: "Queue[Record]" = queue.Queue()

    record_sender_thread = SenderThread(
        settings=settings,
        record_q=send_record_q,
        result_q=result_q,
        stopped=stopped,
        interface=publish_interface,
        debounce_interval_ms=5000,
        context_keeper=context_keeper,
    )
    record_writer_thread = WriterThread(
        settings=settings,
        record_q=write_record_q,
        result_q=result_q,
        stopped=stopped,
        interface=publish_interface,
        sender_q=send_record_q,
        context_keeper=context_keeper,
    )
    record_handler_thread = HandlerThread(
        settings=settings,
        record_q=record_q,
        result_q=result_q,
        stopped=stopped,
        writer_q=write_record_q,
        interface=publish_interface,
        context_keeper=context_keeper,
    )
    # Start (and later join/inspect) order: sender, writer, handler.
    threads: "List[RecordLoopThread]" = [
        record_sender_thread,
        record_writer_thread,
        record_handler_thread,
    ]

    process_check = ProcessCheck(settings=settings, user_pid=user_pid)

    for worker in threads:
        worker.start()

    interrupt_count = 0
    while not stopped.is_set():
        try:
            # wait for stop event
            while not stopped.is_set():
                time.sleep(1)
                if process_check.is_dead():
                    logger.error("Internal process shutdown.")
                    stopped.set()
        except KeyboardInterrupt:
            interrupt_count += 1
            logger.warning(f"Internal process interrupt: {interrupt_count}")
        finally:
            # The first interrupt is tolerated; the second one stops the loop.
            if interrupt_count >= 2:
                logger.error("Internal process interrupted.")
                stopped.set()

    for worker in threads:
        worker.join()

    def close_internal_log() -> None:
        wandb_logger = logging.getLogger("wandb")
        # Iterate over a copy because handlers are removed while looping.
        for attached in list(wandb_logger.handlers):
            attached.close()
            wandb_logger.removeHandler(attached)

    for worker in threads:
        exc_info = worker.get_exception()
        if not exc_info:
            continue
        logger.error(f"Thread {worker.name}:", exc_info=exc_info)
        print(f"Thread {worker.name}:", file=sys.stderr)  # noqa: T201
        traceback.print_exception(*exc_info)
        wandb._sentry.exception(exc_info)
        wandb.termerror("Internal wandb error: file data was not synced")
        # TODO: We can make this more graceful by returning an error to streams.py
        # and potentially just fail the one stream.
        os._exit(-1)

    close_internal_log()
|
177
|
-
|
178
|
-
|
179
|
-
def configure_logging(
    log_fname: str, log_level: int, run_id: Optional[str] = None
) -> None:
    """Attach a file handler for *log_fname* to the ``"wandb"`` logger.

    Args:
        log_fname: Path of the log file to write to.
        log_level: Level applied to the file handler.
        run_id: When given, each record is tagged with this id and the id is
            included in the formatted line.

    The ``"wandb"`` logger itself is set to ``DEBUG`` and stops propagating
    to its ancestors.
    """
    # TODO: we may want make prints and stdout make it into the logs
    # sys.stdout = open(settings.log_internal, "a")
    # sys.stderr = open(settings.log_internal, "a")
    file_handler = logging.FileHandler(log_fname)
    file_handler.setLevel(log_level)

    prefix = "%(asctime)s %(levelname)-7s %(threadName)-10s:%(process)d "

    if run_id:

        class WBFilter(logging.Filter):
            def filter(self, record: "Any") -> bool:
                # Stamp the record so the formatter can resolve %(run_id)s.
                record.run_id = run_id
                return True

        location = "[%(run_id)s:%(filename)s:%(funcName)s():%(lineno)s] %(message)s"
        file_handler.addFilter(WBFilter())
    else:
        location = "[%(filename)s:%(funcName)s():%(lineno)s] %(message)s"

    file_handler.setFormatter(logging.Formatter(prefix + location))

    # If this is called without "wandb", backend logs from this module
    # are not streamed to `debug-internal.log` when we spawn with fork
    # TODO: (cvp) we should really take another pass at logging in general
    wandb_logger = logging.getLogger("wandb")
    wandb_logger.propagate = False
    wandb_logger.setLevel(logging.DEBUG)
    wandb_logger.addHandler(file_handler)
|
214
|
-
|
215
|
-
|
216
|
-
class HandlerThread(internal_util.RecordLoopThread):
    """Record-loop thread that hands each record to a ``HandleManager``.

    ``_setup``, ``_process``, ``_finish`` and ``_debounce`` are presumably
    the hooks driven by ``RecordLoopThread`` (base class not visible here).
    """

    _record_q: "Queue[Record]"
    _result_q: "Queue[Result]"
    _stopped: "Event"
    _context_keeper: context.ContextKeeper

    def __init__(
        self,
        settings: "SettingsStatic",
        record_q: "Queue[Record]",
        result_q: "Queue[Result]",
        stopped: "Event",
        writer_q: "Queue[Record]",
        interface: "InterfaceQueue",
        context_keeper: context.ContextKeeper,
        debounce_interval_ms: "float" = 1000,
    ) -> None:
        """Initialise the loop thread and keep what the manager will need.

        Args:
            settings: Passed on to the ``HandleManager``.
            record_q: Input queue for the loop and for the manager.
            result_q: Queue for results.
            stopped: Shared stop event.
            writer_q: Queue handed to the manager as its writer queue.
            interface: Passed on to the manager.
            context_keeper: Passed on to the manager.
            debounce_interval_ms: Debounce interval given to the base class.
        """
        super().__init__(
            input_record_q=record_q,
            result_q=result_q,
            stopped=stopped,
            debounce_interval_ms=debounce_interval_ms,
        )
        self.name = "HandlerThread"
        # Kept so _setup can build the manager later.
        self._settings = settings
        self._interface = interface
        self._context_keeper = context_keeper
        self._stopped = stopped
        self._record_q = record_q
        self._result_q = result_q
        self._writer_q = writer_q

    def _setup(self) -> None:
        """Create the ``HandleManager`` this thread delegates to."""
        self._hm = handler.HandleManager(
            settings=self._settings,
            record_q=self._record_q,
            result_q=self._result_q,
            stopped=self._stopped,
            writer_q=self._writer_q,
            interface=self._interface,
            context_keeper=self._context_keeper,
        )

    def _process(self, record: "Record") -> None:
        """Let the manager handle one record."""
        self._hm.handle(record)

    def _finish(self) -> None:
        """Tell the manager to finish."""
        self._hm.finish()

    def _debounce(self) -> None:
        """Forward the debounce tick to the manager."""
        self._hm.debounce()
|
269
|
-
|
270
|
-
|
271
|
-
class SenderThread(internal_util.RecordLoopThread):
    """Record-loop thread that hands each record to a ``SendManager``.

    ``_setup``, ``_process``, ``_finish`` and ``_debounce`` are presumably
    the hooks driven by ``RecordLoopThread`` (base class not visible here).
    """

    _record_q: "Queue[Record]"
    _result_q: "Queue[Result]"
    _context_keeper: context.ContextKeeper

    def __init__(
        self,
        settings: "SettingsStatic",
        record_q: "Queue[Record]",
        result_q: "Queue[Result]",
        stopped: "Event",
        interface: "InterfaceQueue",
        context_keeper: context.ContextKeeper,
        debounce_interval_ms: "float" = 5000,
    ) -> None:
        """Initialise the loop thread and keep what the manager will need.

        Args:
            settings: Passed on to the ``SendManager``.
            record_q: Input queue for the loop and for the manager.
            result_q: Queue for results.
            stopped: Shared stop event, given to the base class only.
            interface: Passed on to the manager.
            context_keeper: Passed on to the manager.
            debounce_interval_ms: Debounce interval given to the base class.
        """
        super().__init__(
            input_record_q=record_q,
            result_q=result_q,
            stopped=stopped,
            debounce_interval_ms=debounce_interval_ms,
        )
        self.name = "SenderThread"
        # Kept so _setup can build the manager later.
        self._settings = settings
        self._interface = interface
        self._context_keeper = context_keeper
        self._record_q = record_q
        self._result_q = result_q

    def _setup(self) -> None:
        """Create the ``SendManager`` this thread delegates to."""
        self._sm = sender.SendManager(
            settings=self._settings,
            record_q=self._record_q,
            result_q=self._result_q,
            interface=self._interface,
            context_keeper=self._context_keeper,
        )

    def _process(self, record: "Record") -> None:
        """Let the manager send one record."""
        self._sm.send(record)

    def _finish(self) -> None:
        """Tell the manager to finish."""
        self._sm.finish()

    def _debounce(self) -> None:
        """Forward the debounce tick to the manager."""
        self._sm.debounce()
|
318
|
-
|
319
|
-
|
320
|
-
class WriterThread(internal_util.RecordLoopThread):
    """Record-loop thread that hands each record to a ``WriteManager``.

    ``_setup``, ``_process``, ``_finish`` and ``_debounce`` are presumably
    the hooks driven by ``RecordLoopThread`` (base class not visible here).
    """

    _record_q: "Queue[Record]"
    _result_q: "Queue[Result]"
    _context_keeper: context.ContextKeeper

    def __init__(
        self,
        settings: "SettingsStatic",
        record_q: "Queue[Record]",
        result_q: "Queue[Result]",
        stopped: "Event",
        interface: "InterfaceQueue",
        sender_q: "Queue[Record]",
        context_keeper: context.ContextKeeper,
        debounce_interval_ms: "float" = 1000,
    ) -> None:
        """Initialise the loop thread and keep what the manager will need.

        Args:
            settings: Passed on to the ``WriteManager``.
            record_q: Input queue for the loop and for the manager.
            result_q: Queue for results.
            stopped: Shared stop event, given to the base class only.
            interface: Passed on to the manager.
            sender_q: Queue handed to the manager as its sender queue.
            context_keeper: Passed on to the manager.
            debounce_interval_ms: Debounce interval given to the base class.
        """
        super().__init__(
            input_record_q=record_q,
            result_q=result_q,
            stopped=stopped,
            debounce_interval_ms=debounce_interval_ms,
        )
        self.name = "WriterThread"
        # Kept so _setup can build the manager later.
        self._settings = settings
        self._interface = interface
        self._context_keeper = context_keeper
        self._record_q = record_q
        self._result_q = result_q
        self._sender_q = sender_q

    def _setup(self) -> None:
        """Create the ``WriteManager`` this thread delegates to."""
        self._wm = writer.WriteManager(
            settings=self._settings,
            record_q=self._record_q,
            result_q=self._result_q,
            sender_q=self._sender_q,
            interface=self._interface,
            context_keeper=self._context_keeper,
        )

    def _process(self, record: "Record") -> None:
        """Let the manager write one record."""
        self._wm.write(record)

    def _finish(self) -> None:
        """Tell the manager to finish."""
        self._wm.finish()

    def _debounce(self) -> None:
        """Forward the debounce tick to the manager."""
        self._wm.debounce()
|
370
|
-
|
371
|
-
|
372
|
-
class ProcessCheck:
    """Watch a process id and report when that process no longer exists.

    Checks are rate limited: after a check has run, another one is only
    performed once ``settings.x_internal_check_process`` seconds have passed.
    """

    # Timestamp of the last real check, or None before the first one.
    check_process_last: Optional[float]

    def __init__(self, settings: "SettingsStatic", user_pid: Optional[int]) -> None:
        """Remember the pid to watch and read the check interval.

        Args:
            settings: Provides ``x_internal_check_process``, the minimum
                number of seconds between two checks.
            user_pid: Pid to watch; a falsy value disables checking.
        """
        self.settings = settings
        self.pid = user_pid
        self.check_process_interval = settings.x_internal_check_process
        self.check_process_last = None

    def is_dead(self) -> bool:
        """Return True when a check runs and the watched pid is gone.

        Returns False without checking when checking is disabled (no
        interval or no pid) or when the previous check was too recent.
        """
        interval = self.check_process_interval
        if not (interval and self.pid):
            return False

        now = time.time()
        previous = self.check_process_last
        if previous and now < previous + interval:
            # Too soon since the last check.
            return False
        self.check_process_last = now

        # TODO(jhr): check for os.getppid on unix being 1?
        if psutil.pid_exists(self.pid):
            return False
        logger.warning(
            f"Internal process exiting, parent pid {self.pid} disappeared"
        )
        return True
|