wandb 0.19.4rc1__py3-none-win_amd64.whl → 0.19.6rc4__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +1 -1
- wandb/__init__.pyi +1 -8
- wandb/_iterutils.py +46 -0
- wandb/apis/internal.py +4 -0
- wandb/apis/normalize.py +13 -5
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +9 -2
- wandb/proto/v3/wandb_internal_pb2.py +36 -36
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_internal_pb2.py +36 -36
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_internal_pb2.py +36 -36
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/artifact.py +120 -8
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +12 -5
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -1
- wandb/sdk/backend/backend.py +7 -11
- wandb/sdk/data_types/base_types/wb_value.py +10 -10
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +2 -2
- wandb/sdk/data_types/helper_types/image_mask.py +2 -2
- wandb/sdk/data_types/image.py +0 -3
- wandb/sdk/data_types/saved_model.py +1 -1
- wandb/sdk/data_types/utils.py +2 -6
- wandb/sdk/interface/interface.py +26 -12
- wandb/sdk/interface/interface_sock.py +7 -11
- wandb/sdk/internal/internal_api.py +9 -1
- wandb/sdk/internal/sender.py +2 -2
- wandb/sdk/internal/system/assets/cpu.py +1 -1
- wandb/sdk/lib/apikey.py +7 -19
- wandb/sdk/lib/mailbox.py +0 -14
- wandb/sdk/lib/retry.py +6 -3
- wandb/sdk/lib/run_moment.py +19 -7
- wandb/sdk/lib/server.py +20 -0
- wandb/sdk/lib/service_connection.py +2 -2
- wandb/sdk/wandb_init.py +71 -46
- wandb/sdk/wandb_login.py +86 -110
- wandb/sdk/wandb_metadata.py +60 -31
- wandb/sdk/wandb_run.py +32 -45
- wandb/sdk/wandb_settings.py +465 -143
- wandb/sdk/wandb_setup.py +10 -22
- wandb/util.py +44 -12
- {wandb-0.19.4rc1.dist-info → wandb-0.19.6rc4.dist-info}/METADATA +1 -1
- {wandb-0.19.4rc1.dist-info → wandb-0.19.6rc4.dist-info}/RECORD +47 -46
- {wandb-0.19.4rc1.dist-info → wandb-0.19.6rc4.dist-info}/WHEEL +0 -0
- {wandb-0.19.4rc1.dist-info → wandb-0.19.6rc4.dist-info}/entry_points.txt +0 -0
- {wandb-0.19.4rc1.dist-info → wandb-0.19.6rc4.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/wandb_settings.py
CHANGED
@@ -23,6 +23,7 @@ else:
|
|
23
23
|
|
24
24
|
from google.protobuf.wrappers_pb2 import BoolValue, DoubleValue, Int32Value, StringValue
|
25
25
|
from pydantic import (
|
26
|
+
AliasChoices,
|
26
27
|
BaseModel,
|
27
28
|
ConfigDict,
|
28
29
|
Field,
|
@@ -48,300 +49,597 @@ def _path_convert(*args: str) -> str:
|
|
48
49
|
|
49
50
|
|
50
51
|
class Settings(BaseModel, validate_assignment=True):
|
51
|
-
"""Settings for the W&B SDK.
|
52
|
-
|
53
|
-
|
52
|
+
"""Settings for the W&B SDK.
|
53
|
+
|
54
|
+
This class manages configuration settings for the W&B SDK,
|
55
|
+
ensuring type safety and validation of all settings. Settings are accessible
|
56
|
+
as attributes and can be initialized programmatically, through environment
|
57
|
+
variables (WANDB_ prefix), and via configuration files.
|
58
|
+
|
59
|
+
The settings are organized into three categories:
|
60
|
+
1. Public settings: Core configuration options that users can safely modify to customize
|
61
|
+
W&B's behavior for their specific needs.
|
62
|
+
2. Internal settings: Settings prefixed with 'x_' that handle low-level SDK behavior.
|
63
|
+
These settings are primarily for internal use and debugging. While they can be modified,
|
64
|
+
they are not considered part of the public API and may change without notice in future
|
65
|
+
versions.
|
66
|
+
3. Computed settings: Read-only settings that are automatically derived from other settings or
|
67
|
+
the environment.
|
68
|
+
"""
|
69
|
+
|
70
|
+
# Pydantic Model configuration.
|
54
71
|
model_config = ConfigDict(
|
55
72
|
extra="forbid", # throw an error if extra fields are provided
|
56
|
-
|
73
|
+
validate_default=True, # validate default values
|
74
|
+
use_attribute_docstrings=True, # for field descriptions
|
75
|
+
revalidate_instances="always",
|
57
76
|
)
|
58
77
|
|
59
78
|
# Public settings.
|
60
79
|
|
61
|
-
# Flag to allow table artifacts to be synced in offline mode.
|
62
|
-
#
|
63
|
-
# To revert to the old behavior, set this to False.
|
64
80
|
allow_offline_artifacts: bool = True
|
81
|
+
"""Flag to allow table artifacts to be synced in offline mode.
|
82
|
+
|
83
|
+
To revert to the old behavior, set this to False.
|
84
|
+
"""
|
85
|
+
|
65
86
|
allow_val_change: bool = False
|
66
|
-
|
67
|
-
|
68
|
-
# tracking the run, so you don't accidentally create an anonymous
|
69
|
-
# run.
|
70
|
-
# - "allow": lets a logged-in user track runs with their account, but
|
71
|
-
# lets someone who is running the script without a W&B account see
|
72
|
-
# the charts in the UI.
|
73
|
-
# - "must": sends the run to an anonymous account instead of to a
|
74
|
-
# signed-up user account.
|
87
|
+
"""Flag to allow modification of `Config` values after they've been set."""
|
88
|
+
|
75
89
|
anonymous: Literal["allow", "must", "never"] | None = None
|
76
|
-
|
90
|
+
"""Controls anonymous data logging.
|
91
|
+
|
92
|
+
Possible values are:
|
93
|
+
- "never": requires you to link your W&B account before
|
94
|
+
tracking the run, so you don't accidentally create an anonymous
|
95
|
+
run.
|
96
|
+
- "allow": lets a logged-in user track runs with their account, but
|
97
|
+
lets someone who is running the script without a W&B account see
|
98
|
+
the charts in the UI.
|
99
|
+
- "must": sends the run to an anonymous account instead of to a
|
100
|
+
signed-up user account.
|
101
|
+
"""
|
102
|
+
|
77
103
|
api_key: str | None = None
|
104
|
+
"""The W&B API key."""
|
105
|
+
|
78
106
|
azure_account_url_to_access_key: dict[str, str] | None = None
|
79
|
-
|
107
|
+
"""Mapping of Azure account URLs to their corresponding access keys for Azure integration."""
|
108
|
+
|
80
109
|
base_url: str = "https://api.wandb.ai"
|
110
|
+
"""The URL of the W&B backend for data synchronization."""
|
111
|
+
|
81
112
|
code_dir: str | None = None
|
113
|
+
"""Directory containing the code to be tracked by W&B."""
|
114
|
+
|
82
115
|
config_paths: Sequence[str] | None = None
|
83
|
-
|
84
|
-
|
85
|
-
# system environment and settings.
|
86
|
-
#
|
87
|
-
# "off" - Disables console capture.
|
88
|
-
#
|
89
|
-
# "redirect" - Redirects low-level file descriptors for capturing output.
|
90
|
-
#
|
91
|
-
# "wrap" - Overrides the write methods of sys.stdout/sys.stderr. Will be
|
92
|
-
# mapped to either "wrap_raw" or "wrap_emu" based on the state of the system.
|
93
|
-
#
|
94
|
-
# "wrap_raw" - Same as "wrap" but captures raw output directly instead of
|
95
|
-
# through an emulator.
|
96
|
-
#
|
97
|
-
# "wrap_emu" - Same as "wrap" but captures output through an emulator.
|
116
|
+
"""Paths to files to load configuration from into the `Config` object."""
|
117
|
+
|
98
118
|
console: Literal["auto", "off", "wrap", "redirect", "wrap_raw", "wrap_emu"] = Field(
|
99
119
|
default="auto",
|
100
120
|
validate_default=True,
|
101
121
|
)
|
102
|
-
|
122
|
+
"""The type of console capture to be applied.
|
123
|
+
|
124
|
+
Possible values are:
|
125
|
+
"auto" - Automatically selects the console capture method based on the
|
126
|
+
system environment and settings.
|
127
|
+
|
128
|
+
"off" - Disables console capture.
|
129
|
+
|
130
|
+
"redirect" - Redirects low-level file descriptors for capturing output.
|
131
|
+
|
132
|
+
"wrap" - Overrides the write methods of sys.stdout/sys.stderr. Will be
|
133
|
+
mapped to either "wrap_raw" or "wrap_emu" based on the state of the system.
|
134
|
+
|
135
|
+
"wrap_raw" - Same as "wrap" but captures raw output directly instead of
|
136
|
+
through an emulator. Derived from the `wrap` setting and should not be set manually.
|
137
|
+
|
138
|
+
"wrap_emu" - Same as "wrap" but captures output through an emulator.
|
139
|
+
Derived from the `wrap` setting and should not be set manually.
|
140
|
+
"""
|
141
|
+
|
103
142
|
console_multipart: bool = False
|
104
|
-
|
143
|
+
"""Whether to produce multipart console log files."""
|
144
|
+
|
105
145
|
credentials_file: str = Field(
|
106
146
|
default_factory=lambda: str(credentials.DEFAULT_WANDB_CREDENTIALS_FILE)
|
107
147
|
)
|
108
|
-
|
148
|
+
"""Path to file for writing temporary access tokens."""
|
149
|
+
|
109
150
|
disable_code: bool = False
|
110
|
-
|
151
|
+
"""Whether to disable capturing the code."""
|
152
|
+
|
111
153
|
disable_git: bool = False
|
112
|
-
|
154
|
+
"""Whether to disable capturing the git state."""
|
155
|
+
|
113
156
|
disable_job_creation: bool = True
|
114
|
-
|
157
|
+
"""Whether to disable the creation of a job artifact for W&B Launch."""
|
158
|
+
|
115
159
|
docker: str | None = None
|
116
|
-
|
160
|
+
"""The Docker image used to execute the script."""
|
161
|
+
|
117
162
|
email: str | None = None
|
118
|
-
|
163
|
+
"""The email address of the user."""
|
164
|
+
|
119
165
|
entity: str | None = None
|
166
|
+
"""The W&B entity, such as a user or a team."""
|
167
|
+
|
120
168
|
force: bool = False
|
169
|
+
"""Whether to pass the `force` flag to `wandb.login()`."""
|
170
|
+
|
121
171
|
fork_from: RunMoment | None = None
|
172
|
+
"""Specifies a point in a previous execution of a run to fork from.
|
173
|
+
|
174
|
+
The point is defined by the run ID, a metric, and its value.
|
175
|
+
Currently, only the metric '_step' is supported.
|
176
|
+
"""
|
177
|
+
|
122
178
|
git_commit: str | None = None
|
179
|
+
"""The git commit hash to associate with the run."""
|
180
|
+
|
123
181
|
git_remote: str = "origin"
|
182
|
+
"""The git remote to associate with the run."""
|
183
|
+
|
124
184
|
git_remote_url: str | None = None
|
185
|
+
"""The URL of the git remote repository."""
|
186
|
+
|
125
187
|
git_root: str | None = None
|
188
|
+
"""Root directory of the git repository."""
|
189
|
+
|
126
190
|
heartbeat_seconds: int = 30
|
191
|
+
"""Interval in seconds between heartbeat signals sent to the W&B servers."""
|
192
|
+
|
127
193
|
host: str | None = None
|
128
|
-
|
194
|
+
"""Hostname of the machine running the script."""
|
195
|
+
|
129
196
|
http_proxy: str | None = None
|
130
|
-
|
197
|
+
"""Custom proxy servers for http requests to W&B."""
|
198
|
+
|
131
199
|
https_proxy: str | None = None
|
200
|
+
"""Custom proxy servers for https requests to W&B."""
|
201
|
+
|
132
202
|
# Path to file containing an identity token (JWT) for authentication.
|
133
203
|
identity_token_file: str | None = None
|
134
|
-
|
204
|
+
"""Path to file containing an identity token (JWT) for authentication."""
|
205
|
+
|
135
206
|
ignore_globs: tuple[str, ...] = ()
|
136
|
-
|
207
|
+
"""Unix glob patterns relative to `files_dir` specifying files to exclude from upload."""
|
208
|
+
|
137
209
|
init_timeout: float = 90.0
|
138
|
-
|
210
|
+
"""Time in seconds to wait for the `wandb.init` call to complete before timing out."""
|
211
|
+
|
139
212
|
insecure_disable_ssl: bool = False
|
213
|
+
"""Whether to insecurely disable SSL verification."""
|
214
|
+
|
140
215
|
job_name: str | None = None
|
216
|
+
"""Name of the Launch job running the script."""
|
217
|
+
|
141
218
|
job_source: Literal["repo", "artifact", "image"] | None = None
|
219
|
+
"""Source type for Launch."""
|
220
|
+
|
142
221
|
label_disable: bool = False
|
222
|
+
"""Whether to disable automatic labeling features."""
|
223
|
+
|
143
224
|
launch: bool = False
|
225
|
+
"""Flag to indicate if the run is being launched through W&B Launch."""
|
226
|
+
|
144
227
|
launch_config_path: str | None = None
|
228
|
+
"""Path to the launch configuration file."""
|
229
|
+
|
145
230
|
login_timeout: float | None = None
|
231
|
+
"""Time in seconds to wait for login operations before timing out."""
|
232
|
+
|
146
233
|
mode: Literal["online", "offline", "dryrun", "disabled", "run", "shared"] = Field(
|
147
234
|
default="online",
|
148
235
|
validate_default=True,
|
149
236
|
)
|
237
|
+
"""The operating mode for W&B logging and synchronization."""
|
238
|
+
|
150
239
|
notebook_name: str | None = None
|
151
|
-
|
240
|
+
"""Name of the notebook if running in a Jupyter-like environment."""
|
241
|
+
|
152
242
|
program: str | None = None
|
153
|
-
|
154
|
-
|
155
|
-
#
|
156
|
-
# Root repository directory is defined as the directory containing the
|
157
|
-
# .git directory, if it exists. Otherwise, it's the current working directory.
|
243
|
+
"""Path to the script that created the run, if available."""
|
244
|
+
|
158
245
|
program_abspath: str | None = None
|
246
|
+
"""The absolute path from the root repository directory to the script that
|
247
|
+
created the run.
|
248
|
+
|
249
|
+
Root repository directory is defined as the directory containing the
|
250
|
+
.git directory, if it exists. Otherwise, it's the current working directory.
|
251
|
+
"""
|
252
|
+
|
159
253
|
program_relpath: str | None = None
|
160
|
-
|
254
|
+
"""The relative path to the script that created the run."""
|
255
|
+
|
161
256
|
project: str | None = None
|
257
|
+
"""The W&B project ID."""
|
258
|
+
|
162
259
|
quiet: bool = False
|
260
|
+
"""Flag to suppress non-essential output."""
|
261
|
+
|
163
262
|
reinit: bool = False
|
263
|
+
"""Flag to allow reinitialization of a run.
|
264
|
+
|
265
|
+
If not set, when an active run exists, calling `wandb.init()` returns the existing run
|
266
|
+
instead of creating a new one.
|
267
|
+
"""
|
268
|
+
|
164
269
|
relogin: bool = False
|
165
|
-
|
166
|
-
|
167
|
-
# "must": Resumes from an existing run with the same ID. If no such run exists,
|
168
|
-
# it will result in failure.
|
169
|
-
#
|
170
|
-
# "allow": Attempts to resume from an existing run with the same ID. If none is
|
171
|
-
# found, a new run will be created.
|
172
|
-
#
|
173
|
-
# "never": Always starts a new run. If a run with the same ID already exists,
|
174
|
-
# it will result in failure.
|
175
|
-
#
|
176
|
-
# "auto": Automatically resumes from the most recent failed run on the same
|
177
|
-
# machine.
|
270
|
+
"""Flag to force a new login attempt."""
|
271
|
+
|
178
272
|
resume: Literal["allow", "must", "never", "auto"] | None = None
|
273
|
+
"""Specifies the resume behavior for the run.
|
274
|
+
|
275
|
+
The available options are:
|
276
|
+
|
277
|
+
"must": Resumes from an existing run with the same ID. If no such run exists,
|
278
|
+
it will result in failure.
|
279
|
+
|
280
|
+
"allow": Attempts to resume from an existing run with the same ID. If none is
|
281
|
+
found, a new run will be created.
|
282
|
+
|
283
|
+
"never": Always starts a new run. If a run with the same ID already exists,
|
284
|
+
it will result in failure.
|
285
|
+
|
286
|
+
"auto": Automatically resumes from the most recent failed run on the same
|
287
|
+
machine.
|
288
|
+
"""
|
289
|
+
|
179
290
|
resume_from: RunMoment | None = None
|
180
|
-
|
181
|
-
|
182
|
-
|
291
|
+
"""Specifies a point in a previous execution of a run to resume from.
|
292
|
+
|
293
|
+
The point is defined by the run ID, a metric, and its value.
|
294
|
+
Currently, only the metric '_step' is supported.
|
295
|
+
"""
|
296
|
+
|
183
297
|
resumed: bool = False
|
184
|
-
|
185
|
-
|
298
|
+
"""Indication from the server about the state of the run.
|
299
|
+
|
300
|
+
This is different from resume, a user provided flag.
|
301
|
+
"""
|
302
|
+
|
186
303
|
root_dir: str = Field(default_factory=lambda: os.path.abspath(os.getcwd()))
|
304
|
+
"""The root directory to use as the base for all run-related paths.
|
305
|
+
|
306
|
+
In particular, this is used to derive the wandb directory and the run directory.
|
307
|
+
"""
|
308
|
+
|
187
309
|
run_group: str | None = None
|
188
|
-
|
310
|
+
"""Group identifier for related runs.
|
311
|
+
|
312
|
+
Used for grouping runs in the UI.
|
313
|
+
"""
|
314
|
+
|
189
315
|
run_id: str | None = None
|
316
|
+
"""The ID of the run."""
|
317
|
+
|
190
318
|
run_job_type: str | None = None
|
319
|
+
"""Type of job being run (e.g., training, evaluation)."""
|
320
|
+
|
191
321
|
run_name: str | None = None
|
322
|
+
"""Human-readable name for the run."""
|
323
|
+
|
192
324
|
run_notes: str | None = None
|
325
|
+
"""Additional notes or description for the run."""
|
326
|
+
|
193
327
|
run_tags: tuple[str, ...] | None = None
|
328
|
+
"""Tags to associate with the run for organization and filtering."""
|
329
|
+
|
194
330
|
sagemaker_disable: bool = False
|
331
|
+
"""Flag to disable SageMaker-specific functionality."""
|
332
|
+
|
195
333
|
save_code: bool | None = None
|
334
|
+
"""Whether to save the code associated with the run."""
|
335
|
+
|
196
336
|
settings_system: str = Field(
|
197
337
|
default_factory=lambda: _path_convert(
|
198
338
|
os.path.join("~", ".config", "wandb", "settings")
|
199
339
|
)
|
200
340
|
)
|
341
|
+
"""Path to the system-wide settings file."""
|
342
|
+
|
201
343
|
show_colors: bool | None = None
|
344
|
+
"""Whether to use colored output in the console."""
|
345
|
+
|
202
346
|
show_emoji: bool | None = None
|
347
|
+
"""Whether to show emoji in the console output."""
|
348
|
+
|
203
349
|
show_errors: bool = True
|
350
|
+
"""Whether to display error messages."""
|
351
|
+
|
204
352
|
show_info: bool = True
|
353
|
+
"""Whether to display informational messages."""
|
354
|
+
|
205
355
|
show_warnings: bool = True
|
356
|
+
"""Whether to display warning messages."""
|
357
|
+
|
206
358
|
silent: bool = False
|
359
|
+
"""Flag to suppress all output."""
|
360
|
+
|
207
361
|
start_method: str | None = None
|
362
|
+
"""Method to use for starting subprocesses."""
|
363
|
+
|
208
364
|
strict: bool | None = None
|
365
|
+
"""Whether to enable strict mode for validation and error checking."""
|
366
|
+
|
209
367
|
summary_timeout: int = 60
|
368
|
+
"""Time in seconds to wait for summary operations before timing out."""
|
369
|
+
|
210
370
|
summary_warnings: int = 5 # TODO: kill this with fire
|
371
|
+
"""Maximum number of summary warnings to display."""
|
372
|
+
|
211
373
|
sweep_id: str | None = None
|
374
|
+
"""Identifier of the sweep this run belongs to."""
|
375
|
+
|
212
376
|
sweep_param_path: str | None = None
|
377
|
+
"""Path to the sweep parameters configuration."""
|
378
|
+
|
213
379
|
symlink: bool = Field(
|
214
380
|
default_factory=lambda: False if platform.system() == "Windows" else True
|
215
381
|
)
|
382
|
+
"""Whether to use symlinks (True by default except on Windows)."""
|
383
|
+
|
216
384
|
sync_tensorboard: bool | None = None
|
385
|
+
"""Whether to synchronize TensorBoard logs with W&B."""
|
386
|
+
|
217
387
|
table_raise_on_max_row_limit_exceeded: bool = False
|
388
|
+
"""Whether to raise an exception when table row limits are exceeded."""
|
389
|
+
|
218
390
|
username: str | None = None
|
391
|
+
"""Username."""
|
219
392
|
|
220
393
|
# Internal settings.
|
221
394
|
#
|
222
395
|
# These are typically not meant to be set by the user and should not be considered
|
223
396
|
# a part of the public API as they may change or be removed in future versions.
|
224
397
|
|
225
|
-
# CLI mode.
|
226
398
|
x_cli_only_mode: bool = False
|
227
|
-
|
399
|
+
"""Flag to indicate that the SDK is running in CLI-only mode."""
|
400
|
+
|
228
401
|
x_disable_meta: bool = False
|
229
|
-
|
230
|
-
|
231
|
-
# TODO: this is deprecated and will be removed in future versions.
|
402
|
+
"""Flag to disable the collection of system metadata."""
|
403
|
+
|
232
404
|
x_disable_service: bool = False
|
233
|
-
|
405
|
+
"""Flag to disable the W&B service.
|
406
|
+
|
407
|
+
This is deprecated and will be removed in future versions."""
|
408
|
+
|
234
409
|
x_disable_setproctitle: bool = False
|
235
|
-
|
410
|
+
"""Flag to disable using setproctitle for the internal process in the legacy service.
|
411
|
+
|
412
|
+
This is deprecated and will be removed in future versions.
|
413
|
+
"""
|
414
|
+
|
236
415
|
x_disable_stats: bool = False
|
237
|
-
|
238
|
-
|
239
|
-
# Prevent early viewer query.
|
416
|
+
"""Flag to disable the collection of system metrics."""
|
417
|
+
|
240
418
|
x_disable_viewer: bool = False
|
241
|
-
|
419
|
+
"""Flag to disable the early viewer query."""
|
420
|
+
|
242
421
|
x_disable_machine_info: bool = False
|
243
|
-
|
422
|
+
"""Flag to disable automatic machine info collection."""
|
423
|
+
|
244
424
|
x_executable: str | None = None
|
245
|
-
|
425
|
+
"""Path to the Python executable."""
|
426
|
+
|
246
427
|
x_extra_http_headers: dict[str, str] | None = None
|
247
|
-
|
248
|
-
|
249
|
-
# This applies when wandb-core is enabled. Its purpose is to prevent
|
250
|
-
# HTTP requests from failing due to containing too much data.
|
251
|
-
#
|
252
|
-
# This number is approximate: requests will be slightly larger.
|
428
|
+
"""Additional headers to add to all outgoing HTTP requests."""
|
429
|
+
|
253
430
|
x_file_stream_max_bytes: int | None = None
|
254
|
-
|
431
|
+
"""An approximate maximum request size for the filestream API.
|
432
|
+
|
433
|
+
Its purpose is to prevent HTTP requests from failing due to
|
434
|
+
containing too much data. This number is approximate:
|
435
|
+
requests will be slightly larger.
|
436
|
+
"""
|
437
|
+
|
255
438
|
x_file_stream_max_line_bytes: int | None = None
|
256
|
-
|
439
|
+
"""Maximum line length for filestream JSONL files."""
|
440
|
+
|
257
441
|
x_file_stream_transmit_interval: float | None = None
|
442
|
+
"""Interval in seconds between filestream transmissions."""
|
443
|
+
|
258
444
|
# Filestream retry client configuration.
|
259
|
-
|
445
|
+
|
260
446
|
x_file_stream_retry_max: int | None = None
|
261
|
-
|
447
|
+
"""Max number of retries for filestream operations."""
|
448
|
+
|
262
449
|
x_file_stream_retry_wait_min_seconds: float | None = None
|
263
|
-
|
450
|
+
"""Minimum wait time between retries for filestream operations."""
|
451
|
+
|
264
452
|
x_file_stream_retry_wait_max_seconds: float | None = None
|
265
|
-
|
453
|
+
"""Maximum wait time between retries for filestream operations."""
|
454
|
+
|
266
455
|
x_file_stream_timeout_seconds: float | None = None
|
456
|
+
"""Timeout in seconds for individual filestream HTTP requests."""
|
457
|
+
|
267
458
|
# file transfer retry client configuration
|
459
|
+
|
268
460
|
x_file_transfer_retry_max: int | None = None
|
461
|
+
"""Max number of retries for file transfer operations."""
|
462
|
+
|
269
463
|
x_file_transfer_retry_wait_min_seconds: float | None = None
|
464
|
+
"""Minimum wait time between retries for file transfer operations."""
|
465
|
+
|
270
466
|
x_file_transfer_retry_wait_max_seconds: float | None = None
|
467
|
+
"""Maximum wait time between retries for file transfer operations."""
|
468
|
+
|
271
469
|
x_file_transfer_timeout_seconds: float | None = None
|
272
|
-
|
470
|
+
"""Timeout in seconds for individual file transfer HTTP requests."""
|
471
|
+
|
273
472
|
x_files_dir: str | None = None
|
274
|
-
|
473
|
+
"""Override setting for the computed files_dir."""
|
474
|
+
|
275
475
|
x_flow_control_custom: bool | None = None
|
476
|
+
"""Flag indicating custom flow control for filestream.
|
477
|
+
|
478
|
+
TODO: Not implemented in wandb-core.
|
479
|
+
"""
|
480
|
+
|
276
481
|
x_flow_control_disabled: bool | None = None
|
482
|
+
"""Flag indicating flow control is disabled for filestream.
|
483
|
+
|
484
|
+
TODO: Not implemented in wandb-core.
|
485
|
+
"""
|
486
|
+
|
277
487
|
# graphql retry client configuration
|
488
|
+
|
278
489
|
x_graphql_retry_max: int | None = None
|
490
|
+
"""Max number of retries for GraphQL operations."""
|
491
|
+
|
279
492
|
x_graphql_retry_wait_min_seconds: float | None = None
|
493
|
+
"""Minimum wait time between retries for GraphQL operations."""
|
494
|
+
|
280
495
|
x_graphql_retry_wait_max_seconds: float | None = None
|
496
|
+
"""Maximum wait time between retries for GraphQL operations."""
|
497
|
+
|
281
498
|
x_graphql_timeout_seconds: float | None = None
|
499
|
+
"""Timeout in seconds for individual GraphQL requests."""
|
500
|
+
|
282
501
|
x_internal_check_process: float = 8.0
|
502
|
+
"""Interval for internal process health checks in seconds."""
|
503
|
+
|
283
504
|
x_jupyter_name: str | None = None
|
505
|
+
"""Name of the Jupyter notebook."""
|
506
|
+
|
284
507
|
x_jupyter_path: str | None = None
|
508
|
+
"""Path to the Jupyter notebook."""
|
509
|
+
|
285
510
|
x_jupyter_root: str | None = None
|
286
|
-
|
287
|
-
|
288
|
-
# nodes in a distributed training job.
|
511
|
+
"""Root directory of the Jupyter notebook."""
|
512
|
+
|
289
513
|
x_label: str | None = None
|
514
|
+
"""Label to assign to system metrics and console logs collected for the run.
|
515
|
+
|
516
|
+
This is used to group data on the frontend and can be used to distinguish data
|
517
|
+
from different processes in a distributed training job.
|
518
|
+
"""
|
519
|
+
|
290
520
|
x_live_policy_rate_limit: int | None = None
|
521
|
+
"""Rate limit for live policy updates in seconds."""
|
522
|
+
|
291
523
|
x_live_policy_wait_time: int | None = None
|
524
|
+
"""Wait time between live policy updates in seconds."""
|
525
|
+
|
292
526
|
x_log_level: int = logging.INFO
|
527
|
+
"""Logging level for internal operations."""
|
528
|
+
|
293
529
|
x_network_buffer: int | None = None
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
530
|
+
"""Size of the network buffer used in flow control.
|
531
|
+
|
532
|
+
TODO: Not implemented in wandb-core.
|
533
|
+
"""
|
534
|
+
|
535
|
+
x_primary: bool = Field(
|
536
|
+
default=True, validation_alias=AliasChoices("x_primary", "x_primary_node")
|
537
|
+
)
|
538
|
+
"""Determines whether to save internal wandb files and metadata.
|
539
|
+
|
540
|
+
In a distributed setting, this is useful for avoiding file overwrites
|
541
|
+
from secondary processes when only system metrics and logs are needed,
|
542
|
+
as the primary process handles the main logging.
|
543
|
+
"""
|
544
|
+
|
300
545
|
x_proxies: dict[str, str] | None = None
|
546
|
+
"""Custom proxy servers for requests to W&B.
|
547
|
+
|
548
|
+
This is deprecated and will be removed in future versions.
|
549
|
+
Please use `http_proxy` and `https_proxy` instead.
|
550
|
+
"""
|
551
|
+
|
301
552
|
x_runqueue_item_id: str | None = None
|
553
|
+
"""ID of the Launch run queue item being processed."""
|
554
|
+
|
302
555
|
x_require_legacy_service: bool = False
|
556
|
+
"""Force the use of legacy wandb service."""
|
557
|
+
|
303
558
|
x_save_requirements: bool = True
|
559
|
+
"""Flag to save the requirements file."""
|
560
|
+
|
304
561
|
x_service_transport: str | None = None
|
562
|
+
"""Transport method for communication with the wandb service."""
|
563
|
+
|
305
564
|
x_service_wait: float = 30.0
|
565
|
+
"""Time in seconds to wait for the wandb-core internal service to start."""
|
566
|
+
|
306
567
|
x_show_operation_stats: bool = True
|
307
|
-
|
568
|
+
"""Whether to show statistics about internal operations such as data uploads."""
|
569
|
+
|
308
570
|
x_start_time: float | None = None
|
309
|
-
|
571
|
+
"""The start time of the run in seconds since the Unix epoch."""
|
572
|
+
|
310
573
|
x_stats_pid: int = os.getpid()
|
311
|
-
|
574
|
+
"""PID of the process that started the wandb-core process to collect system stats for."""
|
575
|
+
|
312
576
|
x_stats_sampling_interval: float = Field(default=10.0)
|
313
|
-
|
314
|
-
|
577
|
+
"""Sampling interval for the system monitor in seconds."""
|
578
|
+
|
315
579
|
x_stats_neuron_monitor_config_path: str | None = None
|
316
|
-
|
580
|
+
"""Path to the default config file for the neuron-monitor tool.
|
581
|
+
|
582
|
+
This is used to monitor AWS Trainium devices.
|
583
|
+
"""
|
584
|
+
|
585
|
+
x_stats_dcgm_exporter: str | None = None
|
586
|
+
"""Endpoint to extract Nvidia DCGM metrics from.
|
587
|
+
|
588
|
+
Two options are supported:
|
589
|
+
- Extract DCGM-related metrics from a query to the Prometheus `/api/v1/query` endpoint.
|
590
|
+
It is a common practice to aggregate metrics reported by the instances of the DCGM Exporter
|
591
|
+
running on different nodes in a cluster using Prometheus.
|
592
|
+
- TODO: Parse metrics directly from the `/metrics` endpoint of the DCGM Exporter.
|
593
|
+
|
594
|
+
Examples:
|
595
|
+
- `http://localhost:9400/api/v1/query?query=DCGM_FI_DEV_GPU_TEMP{node="l1337", cluster="globular"}`.
|
596
|
+
- TODO: `http://192.168.0.1:9400/metrics`.
|
597
|
+
"""
|
598
|
+
|
317
599
|
x_stats_open_metrics_endpoints: dict[str, str] | None = None
|
318
|
-
|
319
|
-
|
320
|
-
# - {"metric regex pattern, including endpoint name as prefix": {"label": "label value regex pattern"}}
|
321
|
-
# - ("metric regex pattern 1", "metric regex pattern 2", ...)
|
600
|
+
"""OpenMetrics `/metrics` endpoints to monitor for system metrics."""
|
601
|
+
|
322
602
|
x_stats_open_metrics_filters: dict[str, dict[str, str]] | Sequence[str] | None = (
|
323
603
|
None
|
324
604
|
)
|
325
|
-
|
605
|
+
"""Filter to apply to metrics collected from OpenMetrics `/metrics` endpoints.
|
606
|
+
|
607
|
+
Supports two formats:
|
608
|
+
- {"metric regex pattern, including endpoint name as prefix": {"label": "label value regex pattern"}}
|
609
|
+
- ("metric regex pattern 1", "metric regex pattern 2", ...)
|
610
|
+
"""
|
611
|
+
|
326
612
|
x_stats_open_metrics_http_headers: dict[str, str] | None = None
|
327
|
-
|
613
|
+
"""HTTP headers to add to OpenMetrics requests."""
|
614
|
+
|
328
615
|
x_stats_disk_paths: Sequence[str] | None = Field(
|
329
616
|
default_factory=lambda: ("/", "/System/Volumes/Data")
|
330
617
|
if platform.system() == "Darwin"
|
331
618
|
else ("/",)
|
332
619
|
)
|
333
|
-
|
334
|
-
|
335
|
-
# Assumes 0-based indexing matching CUDA/ROCm device enumeration.
|
620
|
+
"""System paths to monitor for disk usage."""
|
621
|
+
|
336
622
|
x_stats_gpu_device_ids: Sequence[int] | None = None
|
337
|
-
|
338
|
-
|
623
|
+
"""GPU device indices to monitor.
|
624
|
+
|
625
|
+
If not set, captures metrics for all GPUs.
|
626
|
+
Assumes 0-based indexing matching CUDA/ROCm device enumeration.
|
627
|
+
"""
|
628
|
+
|
339
629
|
x_stats_buffer_size: int = 0
|
340
|
-
|
630
|
+
"""Number of system metric samples to buffer in memory in the wandb-core process.
|
631
|
+
|
632
|
+
Can be accessed via run._system_metrics.
|
633
|
+
"""
|
634
|
+
|
341
635
|
x_sync: bool = False
|
342
|
-
|
343
|
-
|
636
|
+
"""Flag to indicate whether we are syncing a run from the transaction log."""
|
637
|
+
|
344
638
|
x_update_finish_state: bool = True
|
639
|
+
"""Flag to indicate whether this process can update the run's final state on the server.
|
640
|
+
|
641
|
+
Set to False in distributed training when only the main process should determine the final state.
|
642
|
+
"""
|
345
643
|
|
346
644
|
# Model validator to catch legacy settings.
|
347
645
|
@model_validator(mode="before")
|
@@ -465,7 +763,11 @@ class Settings(BaseModel, validate_assignment=True):
|
|
465
763
|
@classmethod
|
466
764
|
def validate_fork_from(cls, value, info) -> RunMoment | None:
|
467
765
|
run_moment = cls._runmoment_preprocessor(value)
|
468
|
-
if
|
766
|
+
if (
|
767
|
+
run_moment
|
768
|
+
and info.data.get("run_id") is not None
|
769
|
+
and info.data.get("run_id") == run_moment.run
|
770
|
+
):
|
469
771
|
raise ValueError(
|
470
772
|
"Provided `run_id` is the same as the run to `fork_from`. "
|
471
773
|
"Please provide a different `run_id` or remove the `run_id` argument. "
|
@@ -548,7 +850,11 @@ class Settings(BaseModel, validate_assignment=True):
|
|
548
850
|
@classmethod
|
549
851
|
def validate_resume_from(cls, value, info) -> RunMoment | None:
|
550
852
|
run_moment = cls._runmoment_preprocessor(value)
|
551
|
-
if
|
853
|
+
if (
|
854
|
+
run_moment
|
855
|
+
and info.data.get("run_id") is not None
|
856
|
+
and info.data.get("run_id") != run_moment.run
|
857
|
+
):
|
552
858
|
raise ValueError(
|
553
859
|
"Both `run_id` and `resume_from` have been specified with different ids."
|
554
860
|
)
|
@@ -1025,7 +1331,8 @@ class Settings(BaseModel, validate_assignment=True):
|
|
1025
1331
|
):
|
1026
1332
|
self.save_code = env.should_save_code()
|
1027
1333
|
|
1028
|
-
|
1334
|
+
if os.getenv(env.DISABLE_GIT) is not None:
|
1335
|
+
self.disable_git = env.disable_git()
|
1029
1336
|
|
1030
1337
|
# Attempt to get notebook information if not already set by the user
|
1031
1338
|
if self._jupyter and (self.notebook_name is None or self.notebook_name == ""):
|
@@ -1047,8 +1354,8 @@ class Settings(BaseModel, validate_assignment=True):
|
|
1047
1354
|
f"couldn't find {self.notebook_name}.",
|
1048
1355
|
)
|
1049
1356
|
|
1050
|
-
# host
|
1051
|
-
# vars exist -- but if they don't, we'll fill them in here
|
1357
|
+
# host is populated by update_from_env_vars if the corresponding env
|
1358
|
+
# vars exist -- but if they don't, we'll fill them in here.
|
1052
1359
|
if self.host is None:
|
1053
1360
|
self.host = socket.gethostname() # type: ignore
|
1054
1361
|
|
@@ -1071,8 +1378,15 @@ class Settings(BaseModel, validate_assignment=True):
|
|
1071
1378
|
program = self.program or self._get_program()
|
1072
1379
|
|
1073
1380
|
if program is not None:
|
1074
|
-
|
1075
|
-
|
1381
|
+
try:
|
1382
|
+
root = (
|
1383
|
+
GitRepo().root or os.getcwd()
|
1384
|
+
if not self.disable_git
|
1385
|
+
else os.getcwd()
|
1386
|
+
)
|
1387
|
+
except Exception:
|
1388
|
+
# if the git command fails, fall back to the current working directory
|
1389
|
+
root = os.getcwd()
|
1076
1390
|
|
1077
1391
|
self.program_relpath = self.program_relpath or self._get_program_relpath(
|
1078
1392
|
program, root
|
@@ -1119,6 +1433,22 @@ class Settings(BaseModel, validate_assignment=True):
|
|
1119
1433
|
raise TypeError(f"Unsupported type {type(v)} for setting {k}")
|
1120
1434
|
continue
|
1121
1435
|
|
1436
|
+
# special case for RunMoment fields
|
1437
|
+
if k in ("fork_from", "resume_from"):
|
1438
|
+
run_moment = RunMoment(
|
1439
|
+
run=v.get("run"),
|
1440
|
+
value=v.get("value"),
|
1441
|
+
metric=v.get("metric"),
|
1442
|
+
)
|
1443
|
+
getattr(settings_proto, k).CopyFrom(
|
1444
|
+
wandb_settings_pb2.RunMoment(
|
1445
|
+
run=run_moment.run,
|
1446
|
+
value=run_moment.value,
|
1447
|
+
metric=run_moment.metric,
|
1448
|
+
)
|
1449
|
+
)
|
1450
|
+
continue
|
1451
|
+
|
1122
1452
|
if isinstance(v, bool):
|
1123
1453
|
getattr(settings_proto, k).CopyFrom(BoolValue(value=v))
|
1124
1454
|
elif isinstance(v, int):
|
@@ -1136,14 +1466,6 @@ class Settings(BaseModel, validate_assignment=True):
|
|
1136
1466
|
for key, value in v.items():
|
1137
1467
|
# we only support dicts with string values for now
|
1138
1468
|
mapping.value[key] = value
|
1139
|
-
elif isinstance(v, RunMoment):
|
1140
|
-
getattr(settings_proto, k).CopyFrom(
|
1141
|
-
wandb_settings_pb2.RunMoment(
|
1142
|
-
run=v.run,
|
1143
|
-
value=v.value,
|
1144
|
-
metric=v.metric,
|
1145
|
-
)
|
1146
|
-
)
|
1147
1469
|
elif v is None:
|
1148
1470
|
# None means that the setting value was not set.
|
1149
1471
|
pass
|