wandb 0.19.2__py3-none-any.whl → 0.19.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +1 -1
- wandb/__init__.pyi +13 -3
- wandb/bin/gpu_stats +0 -0
- wandb/integration/metaflow/metaflow.py +7 -9
- wandb/sdk/backend/backend.py +7 -11
- wandb/sdk/interface/interface.py +28 -20
- wandb/sdk/interface/interface_sock.py +7 -11
- wandb/sdk/internal/tb_watcher.py +3 -1
- wandb/sdk/lib/service_connection.py +2 -2
- wandb/sdk/wandb_init.py +395 -240
- wandb/sdk/wandb_run.py +12 -20
- wandb/sdk/wandb_settings.py +2 -27
- {wandb-0.19.2.dist-info → wandb-0.19.4.dist-info}/METADATA +1 -1
- {wandb-0.19.2.dist-info → wandb-0.19.4.dist-info}/RECORD +17 -17
- {wandb-0.19.2.dist-info → wandb-0.19.4.dist-info}/WHEEL +0 -0
- {wandb-0.19.2.dist-info → wandb-0.19.4.dist-info}/entry_points.txt +0 -0
- {wandb-0.19.2.dist-info → wandb-0.19.4.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/wandb_init.py
CHANGED
@@ -11,9 +11,11 @@ For more on using `wandb.init()`, including code snippets, check out our
|
|
11
11
|
from __future__ import annotations
|
12
12
|
|
13
13
|
import copy
|
14
|
+
import dataclasses
|
14
15
|
import json
|
15
16
|
import logging
|
16
17
|
import os
|
18
|
+
import pathlib
|
17
19
|
import platform
|
18
20
|
import sys
|
19
21
|
import tempfile
|
@@ -48,14 +50,6 @@ from .wandb_settings import Settings
|
|
48
50
|
if TYPE_CHECKING:
|
49
51
|
from wandb.proto import wandb_internal_pb2 as pb
|
50
52
|
|
51
|
-
logger: wandb_setup.Logger | None = None # logger configured during wandb.init()
|
52
|
-
|
53
|
-
|
54
|
-
def _set_logger(log_object: wandb_setup.Logger | None) -> None:
|
55
|
-
"""Configure module logger."""
|
56
|
-
global logger
|
57
|
-
logger = log_object
|
58
|
-
|
59
53
|
|
60
54
|
def _huggingface_version() -> str | None:
|
61
55
|
if "transformers" in sys.modules:
|
@@ -112,27 +106,83 @@ def _handle_launch_config(settings: Settings) -> dict[str, Any]:
|
|
112
106
|
return launch_run_config
|
113
107
|
|
114
108
|
|
109
|
+
@dataclasses.dataclass(frozen=True)
|
110
|
+
class _ConfigParts:
|
111
|
+
base_no_artifacts: dict[str, Any]
|
112
|
+
"""The run config passed to `init()` minus any artifact-valued keys."""
|
113
|
+
|
114
|
+
sweep_no_artifacts: dict[str, Any]
|
115
|
+
"""The config loaded as part of a sweep minus any artifact-valued keys."""
|
116
|
+
|
117
|
+
launch_no_artifacts: dict[str, Any]
|
118
|
+
"""The config loaded as part of Launch minus any artifact-valued keys."""
|
119
|
+
|
120
|
+
artifacts: dict[str, Any]
|
121
|
+
"""Artifact keys removed from config dictionaries.
|
122
|
+
|
123
|
+
Due to implementation details of how a Run is constructed,
|
124
|
+
artifacts must be inserted into its config after initialization.
|
125
|
+
"""
|
126
|
+
|
127
|
+
|
115
128
|
class _WandbInit:
|
116
|
-
|
129
|
+
def __init__(
|
130
|
+
self,
|
131
|
+
wl: wandb_setup._WandbSetup,
|
132
|
+
telemetry: telemetry.TelemetryRecord,
|
133
|
+
) -> None:
|
134
|
+
self._wl = wl
|
135
|
+
|
136
|
+
self._telemetry = telemetry
|
137
|
+
"""Telemetry gathered before creating a run.
|
138
|
+
|
139
|
+
After the run is created, `telemetry.context()` is used instead.
|
140
|
+
"""
|
117
141
|
|
118
|
-
def __init__(self) -> None:
|
119
142
|
self.kwargs = None
|
120
|
-
self.settings: Settings | None = None
|
121
|
-
self.sweep_config: dict[str, Any] = {}
|
122
|
-
self.launch_config: dict[str, Any] = {}
|
123
|
-
self.config: dict[str, Any] = {}
|
124
143
|
self.run: Run | None = None
|
125
144
|
self.backend: Backend | None = None
|
126
145
|
|
127
146
|
self._teardown_hooks: list[TeardownHook] = []
|
128
|
-
self._wl = wandb.setup()
|
129
147
|
self.notebook: wandb.jupyter.Notebook | None = None # type: ignore
|
130
148
|
self.printer = printer.new_printer()
|
131
149
|
|
132
|
-
self._init_telemetry_obj = telemetry.TelemetryRecord()
|
133
|
-
|
134
150
|
self.deprecated_features_used: dict[str, str] = dict()
|
135
151
|
|
152
|
+
@property
|
153
|
+
def _logger(self) -> wandb_setup.Logger:
|
154
|
+
return self._wl._get_logger()
|
155
|
+
|
156
|
+
def maybe_login(self, init_settings: Settings) -> None:
|
157
|
+
"""Log in if we are not creating an offline or disabled run.
|
158
|
+
|
159
|
+
This may change the W&B singleton settings.
|
160
|
+
|
161
|
+
Args:
|
162
|
+
init_settings: Settings passed to `wandb.init()` or set via
|
163
|
+
keyword arguments.
|
164
|
+
"""
|
165
|
+
# Allow settings passed to init() to override inferred values.
|
166
|
+
#
|
167
|
+
# Calling login() may change settings on the singleton,
|
168
|
+
# so these may not be the final run settings.
|
169
|
+
run_settings = self._wl.settings.model_copy()
|
170
|
+
run_settings.update_from_settings(init_settings)
|
171
|
+
|
172
|
+
# NOTE: _noop or _offline can become true after _login().
|
173
|
+
# _noop happens if _login hits a timeout.
|
174
|
+
# _offline can be selected by the user at the login prompt.
|
175
|
+
if run_settings._noop or run_settings._offline:
|
176
|
+
return
|
177
|
+
|
178
|
+
wandb_login._login(
|
179
|
+
anonymous=run_settings.anonymous,
|
180
|
+
force=run_settings.force,
|
181
|
+
_disable_warning=True,
|
182
|
+
_silent=run_settings.quiet or run_settings.silent,
|
183
|
+
_entity=run_settings.entity,
|
184
|
+
)
|
185
|
+
|
136
186
|
def warn_env_vars_change_after_setup(self) -> None:
|
137
187
|
"""Warn if environment variables change after wandb singleton is initialized.
|
138
188
|
|
@@ -202,24 +252,15 @@ class _WandbInit:
|
|
202
252
|
warn("run_id", init_settings.run_id)
|
203
253
|
init_settings.run_id = None
|
204
254
|
|
205
|
-
def
|
206
|
-
|
207
|
-
init_settings: Settings,
|
208
|
-
config: dict | str | None = None,
|
209
|
-
config_exclude_keys: list[str] | None = None,
|
210
|
-
config_include_keys: list[str] | None = None,
|
211
|
-
allow_val_change: bool | None = None,
|
212
|
-
monitor_gym: bool | None = None,
|
213
|
-
) -> None:
|
214
|
-
"""Complete setup for `wandb.init()`.
|
255
|
+
def make_run_settings(self, init_settings: Settings) -> Settings:
|
256
|
+
"""Returns the run's settings.
|
215
257
|
|
216
|
-
|
258
|
+
Args:
|
259
|
+
init_settings: Settings passed to `wandb.init()` or set via
|
260
|
+
keyword arguments.
|
217
261
|
"""
|
218
262
|
self.warn_env_vars_change_after_setup()
|
219
263
|
|
220
|
-
_set_logger(self._wl._get_logger())
|
221
|
-
assert logger
|
222
|
-
|
223
264
|
self.clear_run_path_if_sweep_or_launch(init_settings)
|
224
265
|
|
225
266
|
# Inherit global settings.
|
@@ -231,20 +272,129 @@ class _WandbInit:
|
|
231
272
|
# Infer the run ID from SageMaker.
|
232
273
|
if not settings.sagemaker_disable and sagemaker.is_using_sagemaker():
|
233
274
|
if sagemaker.set_run_id(settings):
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
if
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
275
|
+
self._logger.info("set run ID and group based on SageMaker")
|
276
|
+
self._telemetry.feature.sagemaker = True
|
277
|
+
|
278
|
+
# get status of code saving before applying user settings
|
279
|
+
save_code_pre_user_settings = settings.save_code
|
280
|
+
if not settings._offline and not settings._noop:
|
281
|
+
user_settings = self._wl._load_user_settings()
|
282
|
+
if user_settings is not None:
|
283
|
+
settings.update_from_dict(user_settings)
|
284
|
+
|
285
|
+
# ensure that user settings don't set saving to true
|
286
|
+
# if user explicitly set these to false in UI
|
287
|
+
if save_code_pre_user_settings is False:
|
288
|
+
settings.save_code = False
|
289
|
+
|
290
|
+
# TODO: remove this once we refactor the client. This is a temporary
|
291
|
+
# fix to make sure that we use the same project name for wandb-core.
|
292
|
+
# The reason this is not going through the settings object is to
|
293
|
+
# avoid failure cases in other parts of the code that will be
|
294
|
+
# removed with the switch to wandb-core.
|
295
|
+
if settings.project is None:
|
296
|
+
settings.project = wandb.util.auto_project_name(settings.program)
|
297
|
+
|
298
|
+
settings.x_start_time = time.time()
|
299
|
+
|
300
|
+
return settings
|
301
|
+
|
302
|
+
def _load_autoresume_run_id(self, resume_file: pathlib.Path) -> str | None:
|
303
|
+
"""Returns the run_id stored in the auto-resume file, if any.
|
247
304
|
|
305
|
+
Returns None if the file does not exist or is not in a valid format.
|
306
|
+
|
307
|
+
Args:
|
308
|
+
resume_file: The file path to use for resume='auto' mode.
|
309
|
+
"""
|
310
|
+
if not resume_file.exists():
|
311
|
+
return None
|
312
|
+
|
313
|
+
with resume_file.open() as f:
|
314
|
+
try:
|
315
|
+
return json.load(f)["run_id"]
|
316
|
+
|
317
|
+
except json.JSONDecodeError as e:
|
318
|
+
self._logger.exception(
|
319
|
+
f"could not decode {resume_file}, ignoring",
|
320
|
+
exc_info=e,
|
321
|
+
)
|
322
|
+
return None
|
323
|
+
|
324
|
+
except KeyError:
|
325
|
+
self._logger.error(
|
326
|
+
f"resume file at {resume_file} did not store a run_id"
|
327
|
+
)
|
328
|
+
return None
|
329
|
+
|
330
|
+
def _save_autoresume_run_id(
|
331
|
+
self,
|
332
|
+
*,
|
333
|
+
resume_file: pathlib.Path,
|
334
|
+
run_id: str,
|
335
|
+
) -> None:
|
336
|
+
"""Write the run ID to the auto-resume file."""
|
337
|
+
resume_file.parent.mkdir(exist_ok=True)
|
338
|
+
with resume_file.open("w") as f:
|
339
|
+
json.dump({"run_id": run_id}, f)
|
340
|
+
|
341
|
+
def set_run_id(self, settings: Settings) -> None:
|
342
|
+
"""Set the run ID and possibly save it to the auto-resume file.
|
343
|
+
|
344
|
+
After this, `settings.run_id` is guaranteed to be set.
|
345
|
+
|
346
|
+
Args:
|
347
|
+
settings: The run's settings derived from the environment
|
348
|
+
and explicit values passed to `wandb.init()`.
|
349
|
+
"""
|
350
|
+
if settings.resume == "auto" and settings.resume_fname:
|
351
|
+
resume_path = pathlib.Path(settings.resume_fname)
|
352
|
+
else:
|
353
|
+
resume_path = None
|
354
|
+
|
355
|
+
if resume_path:
|
356
|
+
previous_id = self._load_autoresume_run_id(resume_path)
|
357
|
+
|
358
|
+
if not previous_id:
|
359
|
+
pass
|
360
|
+
elif settings.run_id is None:
|
361
|
+
self._logger.info(f"loaded run ID from {resume_path}")
|
362
|
+
settings.run_id = previous_id
|
363
|
+
elif settings.run_id != previous_id:
|
364
|
+
wandb.termwarn(
|
365
|
+
f"Ignoring ID {previous_id} loaded due to resume='auto'"
|
366
|
+
f" because the run ID is set to {settings.run_id}.",
|
367
|
+
)
|
368
|
+
|
369
|
+
# If no run ID was inferred, explicitly set, or loaded from an
|
370
|
+
# auto-resume file, then we generate a new ID.
|
371
|
+
if settings.run_id is None:
|
372
|
+
settings.run_id = runid.generate_id()
|
373
|
+
|
374
|
+
if resume_path:
|
375
|
+
self._save_autoresume_run_id(
|
376
|
+
resume_file=resume_path,
|
377
|
+
run_id=settings.run_id,
|
378
|
+
)
|
379
|
+
|
380
|
+
def make_run_config(
|
381
|
+
self,
|
382
|
+
settings: Settings,
|
383
|
+
config: dict | str | None = None,
|
384
|
+
config_exclude_keys: list[str] | None = None,
|
385
|
+
config_include_keys: list[str] | None = None,
|
386
|
+
) -> _ConfigParts:
|
387
|
+
"""Construct the run's config.
|
388
|
+
|
389
|
+
Args:
|
390
|
+
settings: The run's finalized settings.
|
391
|
+
config: The config passed to `init()`.
|
392
|
+
config_exclude_keys: Deprecated. Keys to filter out from `config`.
|
393
|
+
config_include_keys: Deprecated. Keys to include from `config`.
|
394
|
+
|
395
|
+
Returns:
|
396
|
+
Initial values for the run's config.
|
397
|
+
"""
|
248
398
|
# TODO: remove this once officially deprecated
|
249
399
|
if config_exclude_keys:
|
250
400
|
self.deprecated_features_used["config_exclude_keys"] = (
|
@@ -260,123 +410,77 @@ class _WandbInit:
|
|
260
410
|
exclude=config_exclude_keys,
|
261
411
|
)
|
262
412
|
|
263
|
-
|
264
|
-
|
265
|
-
|
413
|
+
result = _ConfigParts(
|
414
|
+
base_no_artifacts=dict(),
|
415
|
+
sweep_no_artifacts=dict(),
|
416
|
+
launch_no_artifacts=dict(),
|
417
|
+
artifacts=dict(),
|
418
|
+
)
|
266
419
|
|
267
420
|
if not settings.sagemaker_disable and sagemaker.is_using_sagemaker():
|
268
421
|
sagemaker_config = sagemaker.parse_sm_config()
|
269
|
-
self._split_artifacts_from_config(
|
270
|
-
|
271
|
-
|
272
|
-
|
422
|
+
self._split_artifacts_from_config(
|
423
|
+
sagemaker_config,
|
424
|
+
config_target=result.base_no_artifacts,
|
425
|
+
artifacts=result.artifacts,
|
426
|
+
)
|
427
|
+
self._telemetry.feature.sagemaker = True
|
273
428
|
|
274
429
|
if self._wl._config:
|
275
|
-
self._split_artifacts_from_config(
|
430
|
+
self._split_artifacts_from_config(
|
431
|
+
self._wl._config,
|
432
|
+
config_target=result.base_no_artifacts,
|
433
|
+
artifacts=result.artifacts,
|
434
|
+
)
|
276
435
|
|
277
436
|
if config and isinstance(config, dict):
|
278
|
-
self._split_artifacts_from_config(
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
if sweep_config:
|
283
|
-
self._split_artifacts_from_config(sweep_config, self.sweep_config)
|
284
|
-
|
285
|
-
if monitor_gym and len(wandb.patched["gym"]) == 0:
|
286
|
-
wandb.gym.monitor() # type: ignore
|
287
|
-
|
288
|
-
if wandb.patched["tensorboard"]:
|
289
|
-
with telemetry.context(obj=self._init_telemetry_obj) as tel:
|
290
|
-
tel.feature.tensorboard_patch = True
|
291
|
-
|
292
|
-
if settings.sync_tensorboard:
|
293
|
-
if len(wandb.patched["tensorboard"]) == 0:
|
294
|
-
wandb.tensorboard.patch() # type: ignore
|
295
|
-
with telemetry.context(obj=self._init_telemetry_obj) as tel:
|
296
|
-
tel.feature.tensorboard_sync = True
|
297
|
-
|
298
|
-
if not settings._offline and not settings._noop:
|
299
|
-
wandb_login._login(
|
300
|
-
anonymous=settings.anonymous,
|
301
|
-
force=settings.force,
|
302
|
-
_disable_warning=True,
|
303
|
-
_silent=settings.quiet or settings.silent,
|
304
|
-
_entity=settings.entity,
|
437
|
+
self._split_artifacts_from_config(
|
438
|
+
config,
|
439
|
+
config_target=result.base_no_artifacts,
|
440
|
+
artifacts=result.artifacts,
|
305
441
|
)
|
306
442
|
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
"base_url": self._wl.settings.base_url,
|
314
|
-
"force": self._wl.settings.force,
|
315
|
-
"login_timeout": self._wl.settings.login_timeout,
|
316
|
-
}.items()
|
317
|
-
if v is not None
|
318
|
-
}
|
319
|
-
if login_settings:
|
320
|
-
settings.update_from_dict(login_settings)
|
321
|
-
|
322
|
-
# handle custom resume logic
|
323
|
-
settings.handle_resume_logic()
|
324
|
-
|
325
|
-
# get status of code saving before applying user settings
|
326
|
-
save_code_pre_user_settings = settings.save_code
|
327
|
-
if not settings._offline and not settings._noop:
|
328
|
-
user_settings = self._wl._load_user_settings()
|
329
|
-
if user_settings is not None:
|
330
|
-
settings.update_from_dict(user_settings)
|
331
|
-
|
332
|
-
# ensure that user settings don't set saving to true
|
333
|
-
# if user explicitly set these to false in UI
|
334
|
-
if save_code_pre_user_settings is False:
|
335
|
-
settings.save_code = False
|
336
|
-
|
337
|
-
# TODO: remove this once we refactor the client. This is a temporary
|
338
|
-
# fix to make sure that we use the same project name for wandb-core.
|
339
|
-
# The reason this is not going through the settings object is to
|
340
|
-
# avoid failure cases in other parts of the code that will be
|
341
|
-
# removed with the switch to wandb-core.
|
342
|
-
if settings.project is None:
|
343
|
-
settings.project = wandb.util.auto_project_name(settings.program)
|
344
|
-
|
345
|
-
settings.x_start_time = time.time()
|
346
|
-
|
347
|
-
if not settings._noop:
|
348
|
-
self._log_setup(settings)
|
443
|
+
if self._wl._sweep_config:
|
444
|
+
self._split_artifacts_from_config(
|
445
|
+
self._wl._sweep_config,
|
446
|
+
config_target=result.sweep_no_artifacts,
|
447
|
+
artifacts=result.artifacts,
|
448
|
+
)
|
349
449
|
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
450
|
+
if launch_config := _handle_launch_config(settings):
|
451
|
+
self._split_artifacts_from_config(
|
452
|
+
launch_config,
|
453
|
+
config_target=result.launch_no_artifacts,
|
454
|
+
artifacts=result.artifacts,
|
455
|
+
)
|
355
456
|
|
356
|
-
|
457
|
+
return result
|
357
458
|
|
358
459
|
def teardown(self) -> None:
|
359
460
|
# TODO: currently this is only called on failed wandb.init attempts
|
360
461
|
# normally this happens on the run object
|
361
|
-
|
362
|
-
logger.info("tearing down wandb.init")
|
462
|
+
self._logger.info("tearing down wandb.init")
|
363
463
|
for hook in self._teardown_hooks:
|
364
464
|
hook.call()
|
365
465
|
|
366
466
|
def _split_artifacts_from_config(
|
367
|
-
self,
|
467
|
+
self,
|
468
|
+
config_source: dict,
|
469
|
+
config_target: dict,
|
470
|
+
artifacts: dict,
|
368
471
|
) -> None:
|
369
472
|
for k, v in config_source.items():
|
370
473
|
if _is_artifact_representation(v):
|
371
|
-
|
474
|
+
artifacts[k] = v
|
372
475
|
else:
|
373
476
|
config_target.setdefault(k, v)
|
374
477
|
|
375
|
-
def
|
376
|
-
"""
|
478
|
+
def _create_logger(self, log_fname: str) -> logging.Logger:
|
479
|
+
"""Returns a logger configured to write to a file.
|
377
480
|
|
378
|
-
This adds a run_id to the log, in case of multiple processes on the same
|
379
|
-
Currently, there is no way to disable logging after it's
|
481
|
+
This adds a run_id to the log, in case of multiple processes on the same
|
482
|
+
machine. Currently, there is no way to disable logging after it's
|
483
|
+
enabled.
|
380
484
|
"""
|
381
485
|
handler = logging.FileHandler(log_fname)
|
382
486
|
handler.setLevel(logging.INFO)
|
@@ -387,7 +491,8 @@ class _WandbInit:
|
|
387
491
|
)
|
388
492
|
|
389
493
|
handler.setFormatter(formatter)
|
390
|
-
|
494
|
+
|
495
|
+
logger = logging.getLogger("wandb")
|
391
496
|
logger.propagate = False
|
392
497
|
logger.addHandler(handler)
|
393
498
|
# TODO: make me configurable
|
@@ -399,6 +504,8 @@ class _WandbInit:
|
|
399
504
|
)
|
400
505
|
)
|
401
506
|
|
507
|
+
return logger
|
508
|
+
|
402
509
|
def _safe_symlink(
|
403
510
|
self, base: str, target: str, name: str, delete: bool = False
|
404
511
|
) -> None:
|
@@ -429,14 +536,14 @@ class _WandbInit:
|
|
429
536
|
if self.notebook.save_ipynb(): # type: ignore
|
430
537
|
assert self.run is not None
|
431
538
|
res = self.run.log_code(root=None)
|
432
|
-
|
539
|
+
self._logger.info("saved code: %s", res) # type: ignore
|
433
540
|
if self.backend.interface is not None:
|
434
|
-
|
541
|
+
self._logger.info("pausing backend") # type: ignore
|
435
542
|
self.backend.interface.publish_pause()
|
436
543
|
|
437
544
|
def _resume_backend(self, *args: Any, **kwargs: Any) -> None: # noqa
|
438
545
|
if self.backend is not None and self.backend.interface is not None:
|
439
|
-
|
546
|
+
self._logger.info("resuming backend") # type: ignore
|
440
547
|
self.backend.interface.publish_resume()
|
441
548
|
|
442
549
|
def _jupyter_teardown(self) -> None:
|
@@ -447,8 +554,8 @@ class _WandbInit:
|
|
447
554
|
if self.notebook.save_ipynb():
|
448
555
|
assert self.run is not None
|
449
556
|
res = self.run.log_code(root=None)
|
450
|
-
|
451
|
-
|
557
|
+
self._logger.info("saved code and history: %s", res) # type: ignore
|
558
|
+
self._logger.info("cleaning up jupyter logic") # type: ignore
|
452
559
|
# because of how we bind our methods we manually find them to unregister
|
453
560
|
for hook in ipython.events.callbacks["pre_run_cell"]:
|
454
561
|
if "_resume_backend" in hook.__name__:
|
@@ -459,14 +566,14 @@ class _WandbInit:
|
|
459
566
|
ipython.display_pub.publish = ipython.display_pub._orig_publish
|
460
567
|
del ipython.display_pub._orig_publish
|
461
568
|
|
462
|
-
def
|
569
|
+
def monkeypatch_ipython(self, settings: Settings) -> None:
|
463
570
|
"""Add hooks, and session history saving."""
|
464
571
|
self.notebook = wandb.jupyter.Notebook(settings) # type: ignore
|
465
572
|
ipython = self.notebook.shell
|
466
573
|
|
467
574
|
# Monkey patch ipython publish to capture displayed outputs
|
468
575
|
if not hasattr(ipython.display_pub, "_orig_publish"):
|
469
|
-
|
576
|
+
self._logger.info("configuring jupyter hooks %s", self) # type: ignore
|
470
577
|
ipython.display_pub._orig_publish = ipython.display_pub.publish
|
471
578
|
# Registering resume and pause hooks
|
472
579
|
|
@@ -485,7 +592,7 @@ class _WandbInit:
|
|
485
592
|
|
486
593
|
ipython.display_pub.publish = publish
|
487
594
|
|
488
|
-
def
|
595
|
+
def setup_run_log_directory(self, settings: Settings) -> None:
|
489
596
|
"""Set up logging from settings."""
|
490
597
|
filesystem.mkdir_exists_ok(os.path.dirname(settings.log_user))
|
491
598
|
filesystem.mkdir_exists_ok(os.path.dirname(settings.log_internal))
|
@@ -513,25 +620,22 @@ class _WandbInit:
|
|
513
620
|
delete=True,
|
514
621
|
)
|
515
622
|
|
516
|
-
|
517
|
-
self.
|
518
|
-
|
519
|
-
assert self._wl
|
520
|
-
assert logger
|
521
|
-
|
522
|
-
self._wl._early_logger_flush(logger)
|
523
|
-
logger.info(f"Logging user logs to {settings.log_user}")
|
524
|
-
logger.info(f"Logging internal logs to {settings.log_internal}")
|
623
|
+
self._wl._early_logger_flush(self._create_logger(settings.log_user))
|
624
|
+
self._logger.info(f"Logging user logs to {settings.log_user}")
|
625
|
+
self._logger.info(f"Logging internal logs to {settings.log_internal}")
|
525
626
|
|
526
|
-
def
|
627
|
+
def make_disabled_run(self, config: _ConfigParts) -> Run:
|
527
628
|
"""Returns a Run-like object where all methods are no-ops.
|
528
629
|
|
529
|
-
This method is used when
|
530
|
-
|
630
|
+
This method is used when the `mode` setting is set to "disabled", such as
|
631
|
+
by wandb.init(mode="disabled") or by setting the WANDB_MODE environment
|
632
|
+
variable to "disabled".
|
633
|
+
|
634
|
+
It creates a Run object that mimics the behavior of a normal Run but doesn't
|
531
635
|
communicate with the W&B servers.
|
532
636
|
|
533
|
-
The returned Run object has all expected attributes and methods, but they
|
534
|
-
no-op versions that don't perform any actual logging or communication.
|
637
|
+
The returned Run object has all expected attributes and methods, but they
|
638
|
+
are no-op versions that don't perform any actual logging or communication.
|
535
639
|
"""
|
536
640
|
run_id = runid.generate_id()
|
537
641
|
drun = Run(
|
@@ -549,8 +653,8 @@ class _WandbInit:
|
|
549
653
|
)
|
550
654
|
# config, summary, and metadata objects
|
551
655
|
drun._config = wandb.sdk.wandb_config.Config()
|
552
|
-
drun._config.update(
|
553
|
-
drun._config.update(
|
656
|
+
drun._config.update(config.sweep_no_artifacts)
|
657
|
+
drun._config.update(config.base_no_artifacts)
|
554
658
|
drun.summary = SummaryDisabled() # type: ignore
|
555
659
|
drun._Run__metadata = wandb.sdk.wandb_metadata.Metadata()
|
556
660
|
|
@@ -635,24 +739,19 @@ class _WandbInit:
|
|
635
739
|
percent_done = handle.percent_done
|
636
740
|
self.printer.progress_update(line, percent_done=percent_done)
|
637
741
|
|
638
|
-
def init(self) -> Run: # noqa: C901
|
639
|
-
|
640
|
-
raise RuntimeError("Logger not initialized")
|
641
|
-
logger.info("calling init triggers")
|
742
|
+
def init(self, settings: Settings, config: _ConfigParts) -> Run: # noqa: C901
|
743
|
+
self._logger.info("calling init triggers")
|
642
744
|
trigger.call("on_init")
|
643
745
|
|
644
|
-
assert self.settings is not None
|
645
746
|
assert self._wl is not None
|
646
747
|
|
647
|
-
|
648
|
-
f"wandb.init called with sweep_config: {
|
748
|
+
self._logger.info(
|
749
|
+
f"wandb.init called with sweep_config: {config.sweep_no_artifacts}"
|
750
|
+
f"\nconfig: {config.base_no_artifacts}"
|
649
751
|
)
|
650
752
|
|
651
|
-
if self.settings._noop:
|
652
|
-
return self._make_run_disabled()
|
653
753
|
if (
|
654
|
-
|
655
|
-
or (self.settings._jupyter and self.settings.reinit is not False)
|
754
|
+
settings.reinit or (settings._jupyter and settings.reinit is not False)
|
656
755
|
) and len(self._wl._global_run_stack) > 0:
|
657
756
|
if len(self._wl._global_run_stack) > 1:
|
658
757
|
wandb.termwarn(
|
@@ -663,63 +762,66 @@ class _WandbInit:
|
|
663
762
|
)
|
664
763
|
|
665
764
|
latest_run = self._wl._global_run_stack[-1]
|
666
|
-
|
765
|
+
self._logger.info(f"found existing run on stack: {latest_run.id}")
|
667
766
|
latest_run.finish()
|
668
767
|
elif wandb.run is not None and os.getpid() == wandb.run._init_pid:
|
669
|
-
|
768
|
+
self._logger.info("wandb.init() called when a run is still active")
|
769
|
+
|
770
|
+
# NOTE: Updates telemetry on the pre-existing run.
|
670
771
|
with telemetry.context() as tel:
|
671
772
|
tel.feature.init_return_run = True
|
773
|
+
|
672
774
|
return wandb.run
|
673
775
|
|
674
|
-
|
776
|
+
self._logger.info("starting backend")
|
675
777
|
|
676
|
-
if not
|
778
|
+
if not settings.x_disable_service:
|
677
779
|
service = self._wl.ensure_service()
|
678
|
-
|
780
|
+
self._logger.info("sending inform_init request")
|
679
781
|
service.inform_init(
|
680
|
-
settings=
|
681
|
-
run_id=
|
782
|
+
settings=settings.to_proto(),
|
783
|
+
run_id=settings.run_id, # type: ignore
|
682
784
|
)
|
683
785
|
else:
|
684
786
|
service = None
|
685
787
|
|
686
788
|
mailbox = Mailbox()
|
687
789
|
backend = Backend(
|
688
|
-
settings=
|
790
|
+
settings=settings,
|
689
791
|
service=service,
|
690
792
|
mailbox=mailbox,
|
691
793
|
)
|
692
794
|
backend.ensure_launched()
|
693
|
-
|
795
|
+
self._logger.info("backend started and connected")
|
694
796
|
|
695
797
|
# resuming needs access to the server, check server_status()?
|
696
798
|
run = Run(
|
697
|
-
config=
|
698
|
-
settings=
|
699
|
-
sweep_config=
|
700
|
-
launch_config=
|
799
|
+
config=config.base_no_artifacts,
|
800
|
+
settings=settings,
|
801
|
+
sweep_config=config.sweep_no_artifacts,
|
802
|
+
launch_config=config.launch_no_artifacts,
|
701
803
|
)
|
702
804
|
|
703
805
|
# Populate initial telemetry
|
704
|
-
with telemetry.context(run=run, obj=self.
|
806
|
+
with telemetry.context(run=run, obj=self._telemetry) as tel:
|
705
807
|
tel.cli_version = wandb.__version__
|
706
808
|
tel.python_version = platform.python_version()
|
707
809
|
tel.platform = f"{platform.system()}-{platform.machine()}".lower()
|
708
810
|
hf_version = _huggingface_version()
|
709
811
|
if hf_version:
|
710
812
|
tel.huggingface_version = hf_version
|
711
|
-
if
|
813
|
+
if settings._jupyter:
|
712
814
|
tel.env.jupyter = True
|
713
|
-
if
|
815
|
+
if settings._ipython:
|
714
816
|
tel.env.ipython = True
|
715
|
-
if
|
817
|
+
if settings._colab:
|
716
818
|
tel.env.colab = True
|
717
|
-
if
|
819
|
+
if settings._kaggle:
|
718
820
|
tel.env.kaggle = True
|
719
|
-
if
|
821
|
+
if settings._windows:
|
720
822
|
tel.env.windows = True
|
721
823
|
|
722
|
-
if
|
824
|
+
if settings.launch:
|
723
825
|
tel.feature.launch = True
|
724
826
|
|
725
827
|
for module_name in telemetry.list_telemetry_imports(only_imported=True):
|
@@ -727,8 +829,8 @@ class _WandbInit:
|
|
727
829
|
|
728
830
|
# probe the active start method
|
729
831
|
active_start_method: str | None = None
|
730
|
-
if
|
731
|
-
active_start_method =
|
832
|
+
if settings.start_method == "thread":
|
833
|
+
active_start_method = settings.start_method
|
732
834
|
else:
|
733
835
|
active_start_method = getattr(
|
734
836
|
backend._multiprocessing, "get_start_method", lambda: None
|
@@ -746,7 +848,7 @@ class _WandbInit:
|
|
746
848
|
if os.environ.get("PEX"):
|
747
849
|
tel.env.pex = True
|
748
850
|
|
749
|
-
if
|
851
|
+
if settings._aws_lambda:
|
750
852
|
tel.env.aws_lambda = True
|
751
853
|
|
752
854
|
if os.environ.get(wandb.env._DISABLE_SERVICE):
|
@@ -754,13 +856,13 @@ class _WandbInit:
|
|
754
856
|
|
755
857
|
if service:
|
756
858
|
tel.feature.service = True
|
757
|
-
if
|
859
|
+
if settings.x_flow_control_disabled:
|
758
860
|
tel.feature.flow_control_disabled = True
|
759
|
-
if
|
861
|
+
if settings.x_flow_control_custom:
|
760
862
|
tel.feature.flow_control_custom = True
|
761
|
-
if not
|
863
|
+
if not settings.x_require_legacy_service:
|
762
864
|
tel.feature.core = True
|
763
|
-
if
|
865
|
+
if settings._shared:
|
764
866
|
wandb.termwarn(
|
765
867
|
"The `_shared` feature is experimental and may change. "
|
766
868
|
"Please contact support@wandb.com for guidance and to report any issues."
|
@@ -769,7 +871,7 @@ class _WandbInit:
|
|
769
871
|
|
770
872
|
tel.env.maybe_mp = _maybe_mp_process(backend)
|
771
873
|
|
772
|
-
if not
|
874
|
+
if not settings.label_disable:
|
773
875
|
if self.notebook:
|
774
876
|
run._label_probe_notebook(self.notebook)
|
775
877
|
else:
|
@@ -783,13 +885,12 @@ class _WandbInit:
|
|
783
885
|
run=run,
|
784
886
|
)
|
785
887
|
|
786
|
-
|
888
|
+
self._logger.info("updated telemetry")
|
787
889
|
|
788
890
|
run._set_library(self._wl)
|
789
891
|
run._set_backend(backend)
|
790
892
|
run._set_teardown_hooks(self._teardown_hooks)
|
791
893
|
|
792
|
-
backend._hack_set_run(run)
|
793
894
|
assert backend.interface
|
794
895
|
mailbox.enable_keepalive()
|
795
896
|
backend.interface.publish_header()
|
@@ -797,25 +898,23 @@ class _WandbInit:
|
|
797
898
|
# Using GitRepo() blocks & can be slow, depending on user's current git setup.
|
798
899
|
# We don't want to block run initialization/start request, so populate run's git
|
799
900
|
# info beforehand.
|
800
|
-
if not (
|
901
|
+
if not (settings.disable_git or settings.x_disable_machine_info):
|
801
902
|
run._populate_git_info()
|
802
903
|
|
803
904
|
run_result: pb.RunUpdateResult | None = None
|
804
905
|
|
805
|
-
if
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
wandb.termwarn(
|
811
|
-
"`resume` will be ignored since W&B syncing is set to `offline`. "
|
812
|
-
f"Starting a new run with run id {run.id}."
|
813
|
-
)
|
906
|
+
if settings._offline and settings.resume:
|
907
|
+
wandb.termwarn(
|
908
|
+
"`resume` will be ignored since W&B syncing is set to `offline`. "
|
909
|
+
f"Starting a new run with run id {run.id}."
|
910
|
+
)
|
814
911
|
error: wandb.Error | None = None
|
815
912
|
|
816
|
-
timeout =
|
913
|
+
timeout = settings.init_timeout
|
817
914
|
|
818
|
-
|
915
|
+
self._logger.info(
|
916
|
+
f"communicating run to backend with {timeout} second timeout",
|
917
|
+
)
|
819
918
|
|
820
919
|
run_init_handle = backend.interface.deliver_run(run)
|
821
920
|
result = run_init_handle.wait(
|
@@ -842,7 +941,7 @@ class _WandbInit:
|
|
842
941
|
error = ProtobufErrorHandler.to_exception(run_result.error)
|
843
942
|
|
844
943
|
if error is not None:
|
845
|
-
|
944
|
+
self._logger.error(f"encountered error: {error}")
|
846
945
|
if not service:
|
847
946
|
# Shutdown the backend and get rid of the logger
|
848
947
|
# we don't need to do console cleanup at this point
|
@@ -860,19 +959,19 @@ class _WandbInit:
|
|
860
959
|
)
|
861
960
|
|
862
961
|
if run_result.run.resumed:
|
863
|
-
|
962
|
+
self._logger.info("run resumed")
|
864
963
|
with telemetry.context(run=run) as tel:
|
865
964
|
tel.feature.resumed = run_result.run.resumed
|
866
965
|
run._set_run_obj(run_result.run)
|
867
966
|
|
868
|
-
|
967
|
+
self._logger.info("starting run threads in backend")
|
869
968
|
# initiate run (stats and metadata probing)
|
870
969
|
|
871
970
|
if service:
|
872
|
-
assert
|
971
|
+
assert settings.run_id
|
873
972
|
service.inform_start(
|
874
|
-
settings=
|
875
|
-
run_id=
|
973
|
+
settings=settings.to_proto(),
|
974
|
+
run_id=settings.run_id,
|
876
975
|
)
|
877
976
|
|
878
977
|
assert backend.interface
|
@@ -889,15 +988,15 @@ class _WandbInit:
|
|
889
988
|
|
890
989
|
run._handle_launch_artifact_overrides()
|
891
990
|
if (
|
892
|
-
|
893
|
-
and
|
894
|
-
and os.path.exists(
|
991
|
+
settings.launch
|
992
|
+
and settings.launch_config_path
|
993
|
+
and os.path.exists(settings.launch_config_path)
|
895
994
|
):
|
896
|
-
run.save(
|
995
|
+
run.save(settings.launch_config_path)
|
897
996
|
# put artifacts in run config here
|
898
997
|
# since doing so earlier will cause an error
|
899
998
|
# as the run is not upserted
|
900
|
-
for k, v in
|
999
|
+
for k, v in config.artifacts.items():
|
901
1000
|
run.config.update({k: v}, allow_val_change=True)
|
902
1001
|
job_artifact = run._launch_artifact_mapping.get(
|
903
1002
|
wandb.util.LAUNCH_JOB_ARTIFACT_SLOT_NAME
|
@@ -907,7 +1006,7 @@ class _WandbInit:
|
|
907
1006
|
|
908
1007
|
self.backend = backend
|
909
1008
|
run._on_start()
|
910
|
-
|
1009
|
+
self._logger.info("run started, returning control to user process")
|
911
1010
|
return run
|
912
1011
|
|
913
1012
|
|
@@ -938,10 +1037,7 @@ def _attach(
|
|
938
1037
|
wandb._assert_is_user_process() # type: ignore
|
939
1038
|
|
940
1039
|
_wl = wandb.setup()
|
941
|
-
|
942
|
-
_set_logger(_wl._get_logger())
|
943
|
-
if logger is None:
|
944
|
-
raise UsageError("logger is not initialized")
|
1040
|
+
logger = _wl._get_logger()
|
945
1041
|
|
946
1042
|
service = _wl.ensure_service()
|
947
1043
|
|
@@ -972,7 +1068,6 @@ def _attach(
|
|
972
1068
|
run._init(settings=settings)
|
973
1069
|
run._set_library(_wl)
|
974
1070
|
run._set_backend(backend)
|
975
|
-
backend._hack_set_run(run)
|
976
1071
|
assert backend.interface
|
977
1072
|
|
978
1073
|
mailbox.enable_keepalive()
|
@@ -992,6 +1087,26 @@ def _attach(
|
|
992
1087
|
return run
|
993
1088
|
|
994
1089
|
|
1090
|
+
def _monkeypatch_openai_gym() -> None:
|
1091
|
+
"""Patch OpenAI gym to log to the global `wandb.run`."""
|
1092
|
+
if len(wandb.patched["gym"]) > 0:
|
1093
|
+
return
|
1094
|
+
|
1095
|
+
from wandb.integration import gym
|
1096
|
+
|
1097
|
+
gym.monitor()
|
1098
|
+
|
1099
|
+
|
1100
|
+
def _monkeypatch_tensorboard() -> None:
|
1101
|
+
"""Patch TensorBoard to log to the global `wandb.run`."""
|
1102
|
+
if len(wandb.patched["tensorboard"]) > 0:
|
1103
|
+
return
|
1104
|
+
|
1105
|
+
from wandb.integration import tensorboard as tb_module
|
1106
|
+
|
1107
|
+
tb_module.patch()
|
1108
|
+
|
1109
|
+
|
995
1110
|
def init( # noqa: C901
|
996
1111
|
entity: str | None = None,
|
997
1112
|
project: str | None = None,
|
@@ -1229,6 +1344,8 @@ def init( # noqa: C901
|
|
1229
1344
|
"""
|
1230
1345
|
wandb._assert_is_user_process() # type: ignore
|
1231
1346
|
|
1347
|
+
init_telemetry = telemetry.TelemetryRecord()
|
1348
|
+
|
1232
1349
|
init_settings = Settings()
|
1233
1350
|
if isinstance(settings, dict):
|
1234
1351
|
init_settings = Settings(**settings)
|
@@ -1276,27 +1393,65 @@ def init( # noqa: C901
|
|
1276
1393
|
if resume_from is not None:
|
1277
1394
|
init_settings.resume_from = resume_from # type: ignore
|
1278
1395
|
|
1396
|
+
if config is not None:
|
1397
|
+
init_telemetry.feature.set_init_config = True
|
1398
|
+
|
1399
|
+
wl: wandb_setup._WandbSetup | None = None
|
1400
|
+
|
1279
1401
|
try:
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1402
|
+
wl = wandb.setup()
|
1403
|
+
|
1404
|
+
wi = _WandbInit(wl, init_telemetry)
|
1405
|
+
|
1406
|
+
wi.maybe_login(init_settings)
|
1407
|
+
run_settings = wi.make_run_settings(init_settings)
|
1408
|
+
|
1409
|
+
if run_settings.run_id is not None:
|
1410
|
+
init_telemetry.feature.set_init_id = True
|
1411
|
+
if run_settings.run_name is not None:
|
1412
|
+
init_telemetry.feature.set_init_name = True
|
1413
|
+
if run_settings.run_tags is not None:
|
1414
|
+
init_telemetry.feature.set_init_tags = True
|
1415
|
+
if run_settings._offline:
|
1416
|
+
init_telemetry.feature.offline = True
|
1417
|
+
|
1418
|
+
wi.set_run_id(run_settings)
|
1419
|
+
|
1420
|
+
run_config = wi.make_run_config(
|
1421
|
+
settings=run_settings,
|
1283
1422
|
config=config,
|
1284
1423
|
config_exclude_keys=config_exclude_keys,
|
1285
1424
|
config_include_keys=config_include_keys,
|
1286
|
-
allow_val_change=allow_val_change,
|
1287
|
-
monitor_gym=monitor_gym,
|
1288
1425
|
)
|
1289
|
-
|
1426
|
+
|
1427
|
+
if run_settings._noop:
|
1428
|
+
return wi.make_disabled_run(run_config)
|
1429
|
+
|
1430
|
+
wi.setup_run_log_directory(run_settings)
|
1431
|
+
if run_settings._jupyter:
|
1432
|
+
wi.monkeypatch_ipython(run_settings)
|
1433
|
+
|
1434
|
+
if monitor_gym:
|
1435
|
+
_monkeypatch_openai_gym()
|
1436
|
+
|
1437
|
+
if wandb.patched["tensorboard"]:
|
1438
|
+
# NOTE: The user may have called the patch function directly.
|
1439
|
+
init_telemetry.feature.tensorboard_patch = True
|
1440
|
+
if run_settings.sync_tensorboard:
|
1441
|
+
_monkeypatch_tensorboard()
|
1442
|
+
init_telemetry.feature.tensorboard_sync = True
|
1443
|
+
|
1444
|
+
return wi.init(run_settings, run_config)
|
1290
1445
|
|
1291
1446
|
except KeyboardInterrupt as e:
|
1292
|
-
if
|
1293
|
-
|
1447
|
+
if wl:
|
1448
|
+
wl._get_logger().warning("interrupted", exc_info=e)
|
1294
1449
|
|
1295
1450
|
raise
|
1296
1451
|
|
1297
1452
|
except Exception as e:
|
1298
|
-
if
|
1299
|
-
|
1453
|
+
if wl:
|
1454
|
+
wl._get_logger().exception("error in wandb.init()", exc_info=e)
|
1300
1455
|
|
1301
1456
|
# Need to build delay into this sentry capture because our exit hooks
|
1302
1457
|
# mess with sentry's ability to send out errors before the program ends.
|