wandb 0.19.5__py3-none-win32.whl → 0.19.6rc4__py3-none-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,7 @@ else:

  from google.protobuf.wrappers_pb2 import BoolValue, DoubleValue, Int32Value, StringValue
  from pydantic import (
+ AliasChoices,
  BaseModel,
  ConfigDict,
  Field,
@@ -48,300 +49,597 @@ def _path_convert(*args: str) -> str:


  class Settings(BaseModel, validate_assignment=True):
- """Settings for the W&B SDK."""
-
- # Pydantic configuration.
+ """Settings for the W&B SDK.
+
+ This class manages configuration settings for the W&B SDK,
+ ensuring type safety and validation of all settings. Settings are accessible
+ as attributes and can be initialized programmatically, through environment
+ variables (WANDB_ prefix), and via configuration files.
+
+ The settings are organized into three categories:
+ 1. Public settings: Core configuration options that users can safely modify to customize
+ W&B's behavior for their specific needs.
+ 2. Internal settings: Settings prefixed with 'x_' that handle low-level SDK behavior.
+ These settings are primarily for internal use and debugging. While they can be modified,
+ they are not considered part of the public API and may change without notice in future
+ versions.
+ 3. Computed settings: Read-only settings that are automatically derived from other settings or
+ the environment.
+ """
+
+ # Pydantic Model configuration.
  model_config = ConfigDict(
  extra="forbid", # throw an error if extra fields are provided
- # validate_default=True, # validate default values
+ validate_default=True, # validate default values
+ use_attribute_docstrings=True, # for field descriptions
+ revalidate_instances="always",
  )

  # Public settings.

- # Flag to allow table artifacts to be synced in offline mode.
- #
- # To revert to the old behavior, set this to False.
  allow_offline_artifacts: bool = True
+ """Flag to allow table artifacts to be synced in offline mode.
+
+ To revert to the old behavior, set this to False.
+ """
+
  allow_val_change: bool = False
- # Controls anonymous data logging. Possible values are:
- # - "never": requires you to link your W&B account before
- # tracking the run, so you don't accidentally create an anonymous
- # run.
- # - "allow": lets a logged-in user track runs with their account, but
- # lets someone who is running the script without a W&B account see
- # the charts in the UI.
- # - "must": sends the run to an anonymous account instead of to a
- # signed-up user account.
+ """Flag to allow modification of `Config` values after they've been set."""
+
  anonymous: Literal["allow", "must", "never"] | None = None
- # The W&B API key.
+ """Controls anonymous data logging.
+
+ Possible values are:
+ - "never": requires you to link your W&B account before
+ tracking the run, so you don't accidentally create an anonymous
+ run.
+ - "allow": lets a logged-in user track runs with their account, but
+ lets someone who is running the script without a W&B account see
+ the charts in the UI.
+ - "must": sends the run to an anonymous account instead of to a
+ signed-up user account.
+ """
+
  api_key: str | None = None
+ """The W&B API key."""
+
  azure_account_url_to_access_key: dict[str, str] | None = None
- # The URL of the W&B backend, used for GraphQL and filestream operations.
+ """Mapping of Azure account URLs to their corresponding access keys for Azure integration."""
+
  base_url: str = "https://api.wandb.ai"
+ """The URL of the W&B backend for data synchronization."""
+
  code_dir: str | None = None
+ """Directory containing the code to be tracked by W&B."""
+
  config_paths: Sequence[str] | None = None
- # The type of console capture to be applied. Possible values are:
- # "auto" - Automatically selects the console capture method based on the
- # system environment and settings.
- #
- # "off" - Disables console capture.
- #
- # "redirect" - Redirects low-level file descriptors for capturing output.
- #
- # "wrap" - Overrides the write methods of sys.stdout/sys.stderr. Will be
- # mapped to either "wrap_raw" or "wrap_emu" based on the state of the system.
- #
- # "wrap_raw" - Same as "wrap" but captures raw output directly instead of
- # through an emulator.
- #
- # "wrap_emu" - Same as "wrap" but captures output through an emulator.
+ """Paths to files to load configuration from into the `Config` object."""
+
  console: Literal["auto", "off", "wrap", "redirect", "wrap_raw", "wrap_emu"] = Field(
  default="auto",
  validate_default=True,
  )
- # Whether to produce multipart console log files.
+ """The type of console capture to be applied.
+
+ Possible values are:
+ "auto" - Automatically selects the console capture method based on the
+ system environment and settings.
+
+ "off" - Disables console capture.
+
+ "redirect" - Redirects low-level file descriptors for capturing output.
+
+ "wrap" - Overrides the write methods of sys.stdout/sys.stderr. Will be
+ mapped to either "wrap_raw" or "wrap_emu" based on the state of the system.
+
+ "wrap_raw" - Same as "wrap" but captures raw output directly instead of
+ through an emulator. Derived from the `wrap` setting and should not be set manually.
+
+ "wrap_emu" - Same as "wrap" but captures output through an emulator.
+ Derived from the `wrap` setting and should not be set manually.
+ """
+
  console_multipart: bool = False
- # Path to file for writing temporary access tokens.
+ """Whether to produce multipart console log files."""
+
  credentials_file: str = Field(
  default_factory=lambda: str(credentials.DEFAULT_WANDB_CREDENTIALS_FILE)
  )
- # Whether to disable code saving.
+ """Path to file for writing temporary access tokens."""
+
  disable_code: bool = False
- # Whether to disable capturing the git state.
+ """Whether to disable capturing the code."""
+
  disable_git: bool = False
- # Whether to disable the creation of a job artifact for W&B Launch.
+ """Whether to disable capturing the git state."""
+
  disable_job_creation: bool = True
- # The Docker image used to execute the script.
+ """Whether to disable the creation of a job artifact for W&B Launch."""
+
  docker: str | None = None
- # The email address of the user.
+ """The Docker image used to execute the script."""
+
  email: str | None = None
- # The W&B entity, like a user or a team.
+ """The email address of the user."""
+
  entity: str | None = None
+ """The W&B entity, such as a user or a team."""
+
  force: bool = False
+ """Whether to pass the `force` flag to `wandb.login()`."""
+
  fork_from: RunMoment | None = None
+ """Specifies a point in a previous execution of a run to fork from.
+
+ The point is defined by the run ID, a metric, and its value.
+ Currently, only the metric '_step' is supported.
+ """
+
  git_commit: str | None = None
+ """The git commit hash to associate with the run."""
+
  git_remote: str = "origin"
+ """The git remote to associate with the run."""
+
  git_remote_url: str | None = None
+ """The URL of the git remote repository."""
+
  git_root: str | None = None
+ """Root directory of the git repository."""
+
  heartbeat_seconds: int = 30
+ """Interval in seconds between heartbeat signals sent to the W&B servers."""
+
  host: str | None = None
- # The custom proxy servers for http requests to W&B.
+ """Hostname of the machine running the script."""
+
  http_proxy: str | None = None
- # The custom proxy servers for https requests to W&B.
+ """Custom proxy servers for http requests to W&B."""
+
  https_proxy: str | None = None
+ """Custom proxy servers for https requests to W&B."""
+
  # Path to file containing an identity token (JWT) for authentication.
  identity_token_file: str | None = None
- # Unix glob patterns relative to `files_dir` to not upload.
+ """Path to file containing an identity token (JWT) for authentication."""
+
  ignore_globs: tuple[str, ...] = ()
- # Time in seconds to wait for the wandb.init call to complete before timing out.
+ """Unix glob patterns relative to `files_dir` specifying files to exclude from upload."""
+
  init_timeout: float = 90.0
- # Whether to insecurely disable SSL verification.
+ """Time in seconds to wait for the `wandb.init` call to complete before timing out."""
+
  insecure_disable_ssl: bool = False
+ """Whether to insecurely disable SSL verification."""
+
  job_name: str | None = None
+ """Name of the Launch job running the script."""
+
  job_source: Literal["repo", "artifact", "image"] | None = None
+ """Source type for Launch."""
+
  label_disable: bool = False
+ """Whether to disable automatic labeling features."""
+
  launch: bool = False
+ """Flag to indicate if the run is being launched through W&B Launch."""
+
  launch_config_path: str | None = None
+ """Path to the launch configuration file."""
+
  login_timeout: float | None = None
+ """Time in seconds to wait for login operations before timing out."""
+
  mode: Literal["online", "offline", "dryrun", "disabled", "run", "shared"] = Field(
  default="online",
  validate_default=True,
  )
+ """The operating mode for W&B logging and synchronization."""
+
  notebook_name: str | None = None
- # Path to the script that created the run, if available.
+ """Name of the notebook if running in a Jupyter-like environment."""
+
  program: str | None = None
- # The absolute path from the root repository directory to the script that
- # created the run.
- #
- # Root repository directory is defined as the directory containing the
- # .git directory, if it exists. Otherwise, it's the current working directory.
+ """Path to the script that created the run, if available."""
+
  program_abspath: str | None = None
+ """The absolute path from the root repository directory to the script that
+ created the run.
+
+ Root repository directory is defined as the directory containing the
+ .git directory, if it exists. Otherwise, it's the current working directory.
+ """
+
  program_relpath: str | None = None
- # The W&B project ID.
+ """The relative path to the script that created the run."""
+
  project: str | None = None
+ """The W&B project ID."""
+
  quiet: bool = False
+ """Flag to suppress non-essential output."""
+
  reinit: bool = False
+ """Flag to allow reinitialization of a run.
+
+ If not set, when an active run exists, calling `wandb.init()` returns the existing run
+ instead of creating a new one.
+ """
+
  relogin: bool = False
- # Specifies the resume behavior for the run. The available options are:
- #
- # "must": Resumes from an existing run with the same ID. If no such run exists,
- # it will result in failure.
- #
- # "allow": Attempts to resume from an existing run with the same ID. If none is
- # found, a new run will be created.
- #
- # "never": Always starts a new run. If a run with the same ID already exists,
- # it will result in failure.
- #
- # "auto": Automatically resumes from the most recent failed run on the same
- # machine.
+ """Flag to force a new login attempt."""
+
  resume: Literal["allow", "must", "never", "auto"] | None = None
+ """Specifies the resume behavior for the run.
+
+ The available options are:
+
+ "must": Resumes from an existing run with the same ID. If no such run exists,
+ it will result in failure.
+
+ "allow": Attempts to resume from an existing run with the same ID. If none is
+ found, a new run will be created.
+
+ "never": Always starts a new run. If a run with the same ID already exists,
+ it will result in failure.
+
+ "auto": Automatically resumes from the most recent failed run on the same
+ machine.
+ """
+
  resume_from: RunMoment | None = None
- # Indication from the server about the state of the run.
- #
- # This is different from resume, a user provided flag.
+ """Specifies a point in a previous execution of a run to resume from.
+
+ The point is defined by the run ID, a metric, and its value.
+ Currently, only the metric '_step' is supported.
+ """
+
  resumed: bool = False
- # The root directory that will be used to derive other paths,
- # such as the wandb directory, and the run directory.
+ """Indication from the server about the state of the run.
+
+ This is different from resume, a user provided flag.
+ """
+
  root_dir: str = Field(default_factory=lambda: os.path.abspath(os.getcwd()))
+ """The root directory to use as the base for all run-related paths.
+
+ In particular, this is used to derive the wandb directory and the run directory.
+ """
+
  run_group: str | None = None
- # The ID of the run.
+ """Group identifier for related runs.
+
+ Used for grouping runs in the UI.
+ """
+
  run_id: str | None = None
+ """The ID of the run."""
+
  run_job_type: str | None = None
+ """Type of job being run (e.g., training, evaluation)."""
+
  run_name: str | None = None
+ """Human-readable name for the run."""
+
  run_notes: str | None = None
+ """Additional notes or description for the run."""
+
  run_tags: tuple[str, ...] | None = None
+ """Tags to associate with the run for organization and filtering."""
+
  sagemaker_disable: bool = False
+ """Flag to disable SageMaker-specific functionality."""
+
  save_code: bool | None = None
+ """Whether to save the code associated with the run."""
+
  settings_system: str = Field(
  default_factory=lambda: _path_convert(
  os.path.join("~", ".config", "wandb", "settings")
  )
  )
+ """Path to the system-wide settings file."""
+
  show_colors: bool | None = None
+ """Whether to use colored output in the console."""
+
  show_emoji: bool | None = None
+ """Whether to show emoji in the console output."""
+
  show_errors: bool = True
+ """Whether to display error messages."""
+
  show_info: bool = True
+ """Whether to display informational messages."""
+
  show_warnings: bool = True
+ """Whether to display warning messages."""
+
  silent: bool = False
+ """Flag to suppress all output."""
+
  start_method: str | None = None
+ """Method to use for starting subprocesses."""
+
  strict: bool | None = None
+ """Whether to enable strict mode for validation and error checking."""
+
  summary_timeout: int = 60
+ """Time in seconds to wait for summary operations before timing out."""
+
  summary_warnings: int = 5 # TODO: kill this with fire
+ """Maximum number of summary warnings to display."""
+
  sweep_id: str | None = None
+ """Identifier of the sweep this run belongs to."""
+
  sweep_param_path: str | None = None
+ """Path to the sweep parameters configuration."""
+
  symlink: bool = Field(
  default_factory=lambda: False if platform.system() == "Windows" else True
  )
+ """Whether to use symlinks (True by default except on Windows)."""
+
  sync_tensorboard: bool | None = None
+ """Whether to synchronize TensorBoard logs with W&B."""
+
  table_raise_on_max_row_limit_exceeded: bool = False
+ """Whether to raise an exception when table row limits are exceeded."""
+
  username: str | None = None
+ """Username."""

  # Internal settings.
  #
  # These are typically not meant to be set by the user and should not be considered
  # a part of the public API as they may change or be removed in future versions.

- # CLI mode.
  x_cli_only_mode: bool = False
- # Disable the collection of system metadata.
+ """Flag to indicate that the SDK is running in CLI-only mode."""
+
  x_disable_meta: bool = False
- # Pre-wandb-core, this setting was used to disable the (now legacy) wandb service.
- #
- # TODO: this is deprecated and will be removed in future versions.
+ """Flag to disable the collection of system metadata."""
+
  x_disable_service: bool = False
- # Do not use setproctitle for internal process in legacy service.
+ """Flag to disable the W&B service.
+
+ This is deprecated and will be removed in future versions."""
+
  x_disable_setproctitle: bool = False
- # Disable system metrics collection.
+ """Flag to disable using setproctitle for the internal process in the legacy service.
+
+ This is deprecated and will be removed in future versions.
+ """
+
  x_disable_stats: bool = False
- # Disable check for latest version of wandb, from PyPI.
- x_disable_update_check: bool = False
- # Prevent early viewer query.
+ """Flag to disable the collection of system metrics."""
+
  x_disable_viewer: bool = False
- # Disable automatic machine info collection.
+ """Flag to disable the early viewer query."""
+
  x_disable_machine_info: bool = False
- # Python executable
+ """Flag to disable automatic machine info collection."""
+
  x_executable: str | None = None
- # Additional headers to add to all outgoing HTTP requests.
+ """Path to the Python executable."""
+
  x_extra_http_headers: dict[str, str] | None = None
- # An approximate maximum request size for the filestream API.
- #
- # This applies when wandb-core is enabled. Its purpose is to prevent
- # HTTP requests from failing due to containing too much data.
- #
- # This number is approximate: requests will be slightly larger.
+ """Additional headers to add to all outgoing HTTP requests."""
+
  x_file_stream_max_bytes: int | None = None
- # Max line length for filestream jsonl files.
+ """An approximate maximum request size for the filestream API.
+
+ Its purpose is to prevent HTTP requests from failing due to
+ containing too much data. This number is approximate:
+ requests will be slightly larger.
+ """
+
  x_file_stream_max_line_bytes: int | None = None
- # Interval in seconds between filestream transmissions.
+ """Maximum line length for filestream JSONL files."""
+
  x_file_stream_transmit_interval: float | None = None
+ """Interval in seconds between filestream transmissions."""
+
  # Filestream retry client configuration.
- # max number of retries
+
  x_file_stream_retry_max: int | None = None
- # min wait time between retries
+ """Max number of retries for filestream operations."""
+
  x_file_stream_retry_wait_min_seconds: float | None = None
- # max wait time between retries
+ """Minimum wait time between retries for filestream operations."""
+
  x_file_stream_retry_wait_max_seconds: float | None = None
- # timeout for individual HTTP requests
+ """Maximum wait time between retries for filestream operations."""
+
  x_file_stream_timeout_seconds: float | None = None
+ """Timeout in seconds for individual filestream HTTP requests."""
+
  # file transfer retry client configuration
+
  x_file_transfer_retry_max: int | None = None
+ """Max number of retries for file transfer operations."""
+
  x_file_transfer_retry_wait_min_seconds: float | None = None
+ """Minimum wait time between retries for file transfer operations."""
+
  x_file_transfer_retry_wait_max_seconds: float | None = None
+ """Maximum wait time between retries for file transfer operations."""
+
  x_file_transfer_timeout_seconds: float | None = None
- # override setting for the computed files_dir
+ """Timeout in seconds for individual file transfer HTTP requests."""
+
  x_files_dir: str | None = None
- # flow control configuration for file stream
+ """Override setting for the computed files_dir.."""
+
  x_flow_control_custom: bool | None = None
+ """Flag indicating custom flow control for filestream.
+
+ TODO: Not implemented in wandb-core.
+ """
+
  x_flow_control_disabled: bool | None = None
+ """Flag indicating flow control is disabled for filestream.
+
+ TODO: Not implemented in wandb-core.
+ """
+
  # graphql retry client configuration
+
  x_graphql_retry_max: int | None = None
+ """Max number of retries for GraphQL operations."""
+
  x_graphql_retry_wait_min_seconds: float | None = None
+ """Minimum wait time between retries for GraphQL operations."""
+
  x_graphql_retry_wait_max_seconds: float | None = None
+ """Maximum wait time between retries for GraphQL operations."""
+
  x_graphql_timeout_seconds: float | None = None
+ """Timeout in seconds for individual GraphQL requests."""
+
  x_internal_check_process: float = 8.0
+ """Interval for internal process health checks in seconds."""
+
  x_jupyter_name: str | None = None
+ """Name of the Jupyter notebook."""
+
  x_jupyter_path: str | None = None
+ """Path to the Jupyter notebook."""
+
  x_jupyter_root: str | None = None
- # Label to assign to system metrics and console logs collected for the run
- # to group by on the frontend. Can be used to distinguish data from different
- # nodes in a distributed training job.
+ """Root directory of the Jupyter notebook."""
+
  x_label: str | None = None
+ """Label to assign to system metrics and console logs collected for the run.
+
+ This is used to group data by on the frontend and can be used to distinguish data
+ from different processes in a distributed training job.
+ """
+
  x_live_policy_rate_limit: int | None = None
+ """Rate limit for live policy updates in seconds."""
+
  x_live_policy_wait_time: int | None = None
+ """Wait time between live policy updates in seconds."""
+
  x_log_level: int = logging.INFO
+ """Logging level for internal operations."""
+
  x_network_buffer: int | None = None
- # Determines whether to save internal wandb files and metadata.
- # In a distributed setting, this is useful for avoiding file overwrites on secondary nodes
- # when only system metrics and logs are needed, as the primary node handles the main logging.
- x_primary_node: bool = True
- # [deprecated, use http(s)_proxy] custom proxy servers for the requests to W&B
- # [scheme -> url].
+ """Size of the network buffer used in flow control.
+
+ TODO: Not implemented in wandb-core.
+ """
+
+ x_primary: bool = Field(
+ default=True, validation_alias=AliasChoices("x_primary", "x_primary_node")
+ )
+ """Determines whether to save internal wandb files and metadata.
+
+ In a distributed setting, this is useful for avoiding file overwrites
+ from secondary processes when only system metrics and logs are needed,
+ as the primary process handles the main logging.
+ """
+
  x_proxies: dict[str, str] | None = None
+ """Custom proxy servers for requests to W&B.
+
+ This is deprecated and will be removed in future versions.
+ Please use `http_proxy` and `https_proxy` instead.
+ """
+
  x_runqueue_item_id: str | None = None
+ """ID of the Launch run queue item being processed."""
+
  x_require_legacy_service: bool = False
+ """Force the use of legacy wandb service."""
+
  x_save_requirements: bool = True
+ """Flag to save the requirements file."""
+
  x_service_transport: str | None = None
+ """Transport method for communication with the wandb service."""
+
  x_service_wait: float = 30.0
+ """Time in seconds to wait for the wandb-core internal service to start."""
+
  x_show_operation_stats: bool = True
- # The start time of the run in seconds since the Unix epoch.
+ """Whether to show statistics about internal operations such as data uploads."""
+
  x_start_time: float | None = None
- # PID of the process that started the wandb-core process to collect system stats for.
+ """The start time of the run in seconds since the Unix epoch."""
+
  x_stats_pid: int = os.getpid()
- # Sampling interval for the system monitor in seconds.
+ """PID of the process that started the wandb-core process to collect system stats for."""
+
  x_stats_sampling_interval: float = Field(default=10.0)
- # Path to store the default config file for the neuron-monitor tool
- # used to monitor AWS Trainium devices.
+ """Sampling interval for the system monitor in seconds."""
+
  x_stats_neuron_monitor_config_path: str | None = None
- # Open metrics endpoint names and urls.
+ """Path to the default config file for the neuron-monitor tool.
+
+ This is used to monitor AWS Trainium devices.
+ """
+
+ x_stats_dcgm_exporter: str | None = None
+ """Endpoint to extract Nvidia DCGM metrics from.
+
+ Two options are supported:
+ - Extract DCGM-related metrics from a query to the Prometheus `/api/v1/query` endpoint.
+ It is a common practice to aggregate metrics reported by the instances of the DCGM Exporter
+ running on different nodes in a cluster using Prometheus.
+ - TODO: Parse metrics directly from the `/metrics` endpoint of the DCGM Exporter.
+
+ Examples:
+ - `http://localhost:9400/api/v1/query?query=DCGM_FI_DEV_GPU_TEMP{node="l1337", cluster="globular"}`.
+ - TODO: `http://192.168.0.1:9400/metrics`.
+ """
+
  x_stats_open_metrics_endpoints: dict[str, str] | None = None
- # Filter to apply to metrics collected from OpenMetrics endpoints.
- # Supports two formats:
- # - {"metric regex pattern, including endpoint name as prefix": {"label": "label value regex pattern"}}
- # - ("metric regex pattern 1", "metric regex pattern 2", ...)
+ """OpenMetrics `/metrics` endpoints to monitor for system metrics."""
+
  x_stats_open_metrics_filters: dict[str, dict[str, str]] | Sequence[str] | None = (
  None
  )
- # HTTP headers to add to OpenMetrics requests.
+ """Filter to apply to metrics collected from OpenMetrics `/metrics` endpoints.
+
+ Supports two formats:
+ - {"metric regex pattern, including endpoint name as prefix": {"label": "label value regex pattern"}}
+ - ("metric regex pattern 1", "metric regex pattern 2", ...)
+ """
+
  x_stats_open_metrics_http_headers: dict[str, str] | None = None
- # System paths to monitor for disk usage.
+ """HTTP headers to add to OpenMetrics requests."""
+
  x_stats_disk_paths: Sequence[str] | None = Field(
  default_factory=lambda: ("/", "/System/Volumes/Data")
  if platform.system() == "Darwin"
  else ("/",)
  )
- # GPU device indices to monitor (e.g. [0, 1, 2]).
- # If not set, captures metrics for all GPUs.
- # Assumes 0-based indexing matching CUDA/ROCm device enumeration.
+ """System paths to monitor for disk usage."""
+
  x_stats_gpu_device_ids: Sequence[int] | None = None
- # Number of system metric samples to buffer in memory in the wandb-core process.
- # Can be accessed via run._system_metrics.
+ """GPU device indices to monitor.
+
+ If not set, captures metrics for all GPUs.
+ Assumes 0-based indexing matching CUDA/ROCm device enumeration.
+ """
+
  x_stats_buffer_size: int = 0
- # Flag to indicate whether we are syncing a run from the transaction log.
+ """Number of system metric samples to buffer in memory in the wandb-core process.
+
+ Can be accessed via run._system_metrics.
+ """
+
  x_sync: bool = False
- # Controls whether this process can update the run's final state (finished/failed) on the server.
- # Set to False in distributed training when only the main process should determine the final state.
+ """Flag to indicate whether we are syncing a run from the transaction log."""
+
  x_update_finish_state: bool = True
+ """Flag to indicate whether this process can update the run's final state on the server.
+
+ Set to False in distributed training when only the main process should determine the final state.
+ """

  # Model validator to catch legacy settings.
  @model_validator(mode="before")
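The hunk above converts the field comments into attribute docstrings (picked up via `use_attribute_docstrings=True`) and renames the internal `x_primary_node` setting to `x_primary`, keeping the old spelling accepted through the newly imported `AliasChoices`. A minimal sketch of that pydantic pattern, using a stand-in model rather than wandb's actual `Settings` class:

from pydantic import AliasChoices, BaseModel, Field

class MiniSettings(BaseModel):
    # validation_alias lists every accepted input key, including the new field name itself
    x_primary: bool = Field(
        default=True, validation_alias=AliasChoices("x_primary", "x_primary_node")
    )

print(MiniSettings(x_primary_node=False).x_primary)  # False: legacy key still accepted
print(MiniSettings(x_primary=False).x_primary)       # False: new key works as well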
@@ -1033,7 +1331,8 @@ class Settings(BaseModel, validate_assignment=True):
  ):
  self.save_code = env.should_save_code()

- self.disable_git = env.disable_git()
+ if os.getenv(env.DISABLE_GIT) is not None:
+ self.disable_git = env.disable_git()

  # Attempt to get notebook information if not already set by the user
  if self._jupyter and (self.notebook_name is None or self.notebook_name == ""):
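This hunk makes the environment override conditional: `disable_git` is only overwritten when the variable named by `env.DISABLE_GIT` is actually present, so a value set programmatically is no longer clobbered by the environment default. A minimal sketch of the pattern, with a hypothetical helper that is not part of wandb:

import os

def apply_env_override(settings: object, attr: str, env_var: str) -> None:
    # Only touch the setting when the variable is set; an explicit
    # programmatic value survives when the variable is absent.
    raw = os.getenv(env_var)
    if raw is not None:
        setattr(settings, attr, raw.lower() in ("1", "true", "yes"))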
@@ -1055,8 +1354,8 @@ class Settings(BaseModel, validate_assignment=True):
  f"couldn't find {self.notebook_name}.",
  )

- # host and username are populated by apply_env_vars if corresponding env
- # vars exist -- but if they don't, we'll fill them in here
+ # host is populated by update_from_env_vars if the corresponding env
+ # vars exist -- but if they don't, we'll fill them in here.
  if self.host is None:
  self.host = socket.gethostname() # type: ignore

@@ -1079,8 +1378,15 @@ class Settings(BaseModel, validate_assignment=True):
  program = self.program or self._get_program()

  if program is not None:
- repo = GitRepo()
- root = repo.root or os.getcwd()
+ try:
+ root = (
+ GitRepo().root or os.getcwd()
+ if not self.disable_git
+ else os.getcwd()
+ )
+ except Exception:
+ # if the git command fails, fall back to the current working directory
+ root = os.getcwd()

  self.program_relpath = self.program_relpath or self._get_program_relpath(
  program, root
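The final hunk makes root-directory detection tolerant of disabled or broken git: when `disable_git` is set, or the `GitRepo()` lookup raises, the current working directory is used instead. A standalone illustration of the same fallback, assuming the plain `git` CLI rather than wandb's internal `GitRepo` helper:

import os
import subprocess

def detect_root(disable_git: bool = False) -> str:
    # Prefer the repository top level; fall back to the working directory
    # when git is disabled, missing, or the command fails.
    if disable_git:
        return os.getcwd()
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True, text=True, check=True,
        )
        return out.stdout.strip() or os.getcwd()
    except (OSError, subprocess.CalledProcessError):
        return os.getcwd()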