wandb 0.16.6__py3-none-any.whl → 0.17.0rc2__py3-none-any.whl
- package_readme.md +95 -0
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +0 -1
- wandb/analytics/sentry.py +2 -1
- wandb/apis/importers/internals/protocols.py +30 -56
- wandb/apis/importers/mlflow.py +13 -26
- wandb/apis/importers/wandb.py +8 -14
- wandb/apis/public/api.py +1 -0
- wandb/apis/public/artifacts.py +1 -0
- wandb/apis/public/files.py +1 -0
- wandb/apis/public/history.py +1 -0
- wandb/apis/public/jobs.py +1 -0
- wandb/apis/public/projects.py +1 -0
- wandb/apis/public/reports.py +1 -0
- wandb/apis/public/runs.py +1 -0
- wandb/apis/public/sweeps.py +1 -0
- wandb/apis/public/teams.py +1 -0
- wandb/apis/public/users.py +1 -0
- wandb/apis/reports/v1/_blocks.py +3 -7
- wandb/apis/reports/v2/gql.py +1 -0
- wandb/apis/reports/v2/interface.py +3 -4
- wandb/apis/reports/v2/internal.py +5 -8
- wandb/cli/cli.py +2 -2
- wandb/data_types.py +9 -6
- wandb/docker/__init__.py +1 -1
- wandb/env.py +38 -8
- wandb/errors/__init__.py +5 -0
- wandb/integration/catboost/catboost.py +1 -1
- wandb/integration/fastai/__init__.py +1 -0
- wandb/integration/huggingface/resolver.py +2 -2
- wandb/integration/keras/__init__.py +1 -0
- wandb/integration/keras/callbacks/metrics_logger.py +1 -1
- wandb/integration/keras/keras.py +7 -7
- wandb/integration/langchain/wandb_tracer.py +1 -0
- wandb/integration/lightning/fabric/logger.py +1 -3
- wandb/integration/metaflow/metaflow.py +41 -6
- wandb/integration/openai/fine_tuning.py +3 -3
- wandb/keras/__init__.py +1 -0
- wandb/old/summary.py +1 -1
- wandb/plot/confusion_matrix.py +1 -1
- wandb/plots/precision_recall.py +1 -1
- wandb/plots/roc.py +1 -1
- wandb/proto/v3/wandb_internal_pb2.py +364 -332
- wandb/proto/v3/wandb_settings_pb2.py +1 -1
- wandb/proto/v4/wandb_internal_pb2.py +322 -316
- wandb/proto/v4/wandb_settings_pb2.py +1 -1
- wandb/proto/wandb_internal_codegen.py +0 -25
- wandb/sdk/artifacts/artifact.py +16 -4
- wandb/sdk/artifacts/artifact_download_logger.py +1 -0
- wandb/sdk/artifacts/artifact_file_cache.py +18 -4
- wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
- wandb/sdk/artifacts/artifact_manifest.py +1 -0
- wandb/sdk/artifacts/artifact_manifest_entry.py +1 -0
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +5 -2
- wandb/sdk/artifacts/artifact_state.py +1 -0
- wandb/sdk/artifacts/artifact_ttl.py +1 -0
- wandb/sdk/artifacts/exceptions.py +1 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
- wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +1 -0
- wandb/sdk/artifacts/storage_policy.py +1 -0
- wandb/sdk/data_types/_dtypes.py +8 -8
- wandb/sdk/data_types/base_types/media.py +3 -6
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
- wandb/sdk/data_types/image.py +1 -1
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/auto_logging.py +5 -6
- wandb/sdk/integration_utils/data_logging.py +10 -6
- wandb/sdk/interface/interface.py +55 -32
- wandb/sdk/interface/interface_shared.py +7 -13
- wandb/sdk/internal/datastore.py +1 -1
- wandb/sdk/internal/handler.py +18 -2
- wandb/sdk/internal/internal.py +0 -1
- wandb/sdk/internal/internal_util.py +0 -1
- wandb/sdk/internal/job_builder.py +5 -4
- wandb/sdk/internal/profiler.py +1 -0
- wandb/sdk/internal/run.py +1 -0
- wandb/sdk/internal/sender.py +1 -1
- wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
- wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
- wandb/sdk/internal/system/assets/interfaces.py +6 -8
- wandb/sdk/internal/system/assets/open_metrics.py +2 -2
- wandb/sdk/internal/system/assets/trainium.py +1 -3
- wandb/sdk/launch/_project_spec.py +8 -4
- wandb/sdk/launch/agent/agent.py +2 -1
- wandb/sdk/launch/agent/config.py +72 -11
- wandb/sdk/launch/builder/abstract.py +2 -1
- wandb/sdk/launch/builder/build.py +29 -2
- wandb/sdk/launch/builder/docker_builder.py +1 -0
- wandb/sdk/launch/builder/kaniko_builder.py +2 -2
- wandb/sdk/launch/builder/noop.py +1 -0
- wandb/sdk/launch/create_job.py +18 -0
- wandb/sdk/launch/environment/abstract.py +1 -0
- wandb/sdk/launch/environment/gcp_environment.py +1 -0
- wandb/sdk/launch/environment/local_environment.py +1 -0
- wandb/sdk/launch/loader.py +1 -0
- wandb/sdk/launch/registry/abstract.py +1 -0
- wandb/sdk/launch/registry/azure_container_registry.py +1 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +2 -1
- wandb/sdk/launch/registry/local_registry.py +1 -0
- wandb/sdk/launch/runner/abstract.py +1 -0
- wandb/sdk/launch/runner/kubernetes_monitor.py +1 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +4 -3
- wandb/sdk/launch/runner/sagemaker_runner.py +11 -10
- wandb/sdk/launch/sweeps/scheduler.py +4 -3
- wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
- wandb/sdk/launch/sweeps/utils.py +3 -3
- wandb/sdk/launch/utils.py +3 -3
- wandb/sdk/lib/fsm.py +8 -12
- wandb/sdk/lib/gitlib.py +4 -4
- wandb/sdk/lib/import_hooks.py +1 -1
- wandb/sdk/lib/lazyloader.py +0 -1
- wandb/sdk/lib/proto_util.py +1 -1
- wandb/sdk/lib/redirect.py +19 -14
- wandb/sdk/lib/retry.py +3 -2
- wandb/sdk/lib/tracelog.py +1 -1
- wandb/sdk/service/service.py +17 -15
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_manager.py +2 -2
- wandb/sdk/wandb_require.py +5 -0
- wandb/sdk/wandb_run.py +25 -20
- wandb/sdk/wandb_settings.py +0 -1
- wandb/sdk/wandb_setup.py +1 -1
- wandb/sklearn/__init__.py +1 -0
- wandb/sklearn/plot/__init__.py +1 -0
- wandb/sklearn/plot/classifier.py +7 -6
- wandb/sklearn/plot/clusterer.py +2 -1
- wandb/sklearn/plot/regressor.py +1 -0
- wandb/sklearn/plot/shared.py +1 -0
- wandb/sklearn/utils.py +1 -0
- wandb/testing/relay.py +4 -4
- wandb/trigger.py +1 -0
- wandb/util.py +40 -17
- wandb/wandb_controller.py +2 -3
- wandb/wandb_torch.py +1 -2
- {wandb-0.16.6.dist-info → wandb-0.17.0rc2.dist-info}/METADATA +68 -69
- {wandb-0.16.6.dist-info → wandb-0.17.0rc2.dist-info}/RECORD +149 -150
- {wandb-0.16.6.dist-info → wandb-0.17.0rc2.dist-info}/WHEEL +1 -2
- wandb/bin/apple_gpu_stats +0 -0
- wandb-0.16.6.dist-info/top_level.txt +0 -1
- {wandb-0.16.6.dist-info → wandb-0.17.0rc2.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.6.dist-info → wandb-0.17.0rc2.dist-info/licenses}/LICENSE +0 -0
@@ -78,7 +78,7 @@ class ValidationDataLogger:
                 Defaults to `"wb_validation_data"`.
             artifact_type: The artifact type to use for the validation data.
                 Defaults to `"validation_dataset"`.
-            class_labels: Optional list of
+            class_labels: Optional list of labels to use in the inferred
                 processors. If the model's `target` or `output` is inferred to be a class,
                 we will attempt to map the class to these labels. Defaults to `None`.
             infer_missing_processors: Determines if processors are inferred if
@@ -262,7 +262,7 @@ def _infer_single_example_keyed_processor(
     ):
         np = wandb.util.get_module(
             "numpy",
-            required="
+            required="Inferring processors require numpy",
         )
         # Assume these are logits
         class_names = class_labels_table.get_column("label")
@@ -291,13 +291,17 @@ def _infer_single_example_keyed_processor(
     ):
         # assume this is a class
         if class_labels_table is not None:
-            processors["class"] =
+            processors["class"] = (
+                lambda n, d, p: class_labels_table.index_ref(d[0])
+                if d[0] < len(class_labels_table.data)
+                else d[0]
+            )  # type: ignore
         else:
             processors["val"] = lambda n, d, p: d[0]
     elif len(shape) == 1:
         np = wandb.util.get_module(
             "numpy",
-            required="
+            required="Inferring processors require numpy",
         )
         # This could be anything
         if shape[0] <= 10:
@@ -350,7 +354,7 @@ def _infer_validation_row_processor(
     input_col_name: str = "input",
     target_col_name: str = "target",
 ) -> Callable:
-    """Infers the
+    """Infers the composite processor for the validation data."""
     single_processors = {}
     if isinstance(example_input, dict):
         for key in example_input:
@@ -427,7 +431,7 @@ def _infer_prediction_row_processor(
     input_col_name: str = "input",
     output_col_name: str = "output",
 ) -> Callable:
-    """Infers the
+    """Infers the composite processor for the prediction output data."""
    single_processors = {}

    if isinstance(example_prediction, dict):
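In plain Python, the guarded lambda added above behaves roughly as sketched below; `labels` is a stand-in for the `class_labels_table` column, and the table's `index_ref` lookup is replaced by a plain list index for illustration.

labels = ["cat", "dog", "bird"]  # stand-in for class_labels_table

def class_processor(d):
    # Map a predicted class index to its label, but fall back to the raw
    # index when it is out of range (mirrors the `if d[0] < len(...)` guard).
    return labels[d[0]] if d[0] < len(labels) else d[0]

print(class_processor([1]))   # -> "dog"
print(class_processor([7]))   # -> 7 (out of range, passed through unchanged)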
wandb/sdk/interface/interface.py
CHANGED
@@ -387,7 +387,7 @@ class InterfaceBase:
     def _make_partial_source_str(
         source: Any, job_info: Dict[str, Any], metadata: Dict[str, Any]
     ) -> str:
-        """Construct use_artifact.partial.source_info.
+        """Construct use_artifact.partial.source_info.source as str."""
         source_type = job_info.get("source_type", "").strip()
         if source_type == "artifact":
             info_source = job_info.get("source", {})
@@ -424,7 +424,7 @@ class InterfaceBase:
             job_info=job_info,
             metadata=metadata,
         )
-        use_artifact.partial.source_info.source.ParseFromString(src_str)
+        use_artifact.partial.source_info.source.ParseFromString(src_str)  # type: ignore[arg-type]

         return use_artifact

@@ -516,11 +516,15 @@ class InterfaceBase:
         artifact_id: str,
         download_root: str,
         allow_missing_references: bool,
+        skip_cache: bool,
+        path_prefix: Optional[str],
     ) -> MailboxHandle:
         download_artifact = pb.DownloadArtifactRequest()
         download_artifact.artifact_id = artifact_id
         download_artifact.download_root = download_root
         download_artifact.allow_missing_references = allow_missing_references
+        download_artifact.skip_cache = skip_cache
+        download_artifact.path_prefix = path_prefix or ""
         resp = self._deliver_download_artifact(download_artifact)
         return resp

@@ -729,6 +733,55 @@ class InterfaceBase:
     def _publish_keepalive(self, keepalive: pb.KeepaliveRequest) -> None:
         raise NotImplementedError

+    def publish_job_input(
+        self,
+        include_paths: List[List[str]],
+        exclude_paths: List[List[str]],
+        run_config: bool = False,
+        file_path: str = "",
+    ):
+        """Publishes a request to add inputs to the job.
+
+        If run_config is True, the wandb.config will be added as a job input.
+        If file_path is provided, the file at file_path will be added as a job
+        input.
+
+        The paths provided as arguments are sequences of dictionary keys that
+        specify a path within the wandb.config. If a path is included, the
+        corresponding field will be treated as a job input. If a path is
+        excluded, the corresponding field will not be treated as a job input.
+
+        Args:
+            include_paths: paths within config to include as job inputs.
+            exclude_paths: paths within config to exclude as job inputs.
+            run_config: bool indicating whether wandb.config is the input source.
+            file_path: path to file to include as a job input.
+        """
+        if run_config and file_path:
+            raise ValueError(
+                "run_config and file_path are mutually exclusive arguments."
+            )
+        request = pb.JobInputRequest()
+        include_records = [pb.JobInputPath(path=path) for path in include_paths]
+        exclude_records = [pb.JobInputPath(path=path) for path in exclude_paths]
+        request.include_paths.extend(include_records)
+        request.exclude_paths.extend(exclude_records)
+        source = pb.JobInputSource(
+            run_config=pb.JobInputSource.RunConfigSource(),
+        )
+        if run_config:
+            source.run_config.CopyFrom(pb.JobInputSource.RunConfigSource())
+        else:
+            source.file.CopyFrom(
+                pb.JobInputSource.ConfigFileSource(path=file_path),
+            )
+
+        return self._publish_job_input(request)
+
+    @abstractmethod
+    def _publish_job_input(self, request: pb.JobInputRequest) -> MailboxHandle:
+        raise NotImplementedError
+
     def join(self) -> None:
         # Drop indicates that the internal process has already been shutdown
         if self._drop:
@@ -779,36 +832,6 @@ class InterfaceBase:
         run_start.run.CopyFrom(run_pb)
         return self._deliver_run_start(run_start)

-    def publish_launch_wandb_config_parameters(
-        self, include_paths: List[List[str]], exclude_paths: List[List[str]]
-    ):
-        """Tells the internal process to treat wandb.config fields as job inputs.
-
-        The paths provided as arguments are sequences of dictionary keys that
-        specify a path within the wandb.config. If a path is included, the
-        corresponding field will be treated as a job input. If a path is
-        excluded, the corresponding field will not be treated as a job input.
-
-        Args:
-            include_paths: paths within config to include as job inputs.
-            exclude_paths: paths within config to exclude as job inputs.
-
-        Returns:
-            None
-        """
-        config_parameters = pb.LaunchWandbConfigParametersRecord()
-        include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
-        exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
-        config_parameters.include_paths.extend(include_records)
-        config_parameters.exclude_paths.extend(exclude_records)
-        return self._publish_launch_wandb_config_parameters(config_parameters)
-
-    @abstractmethod
-    def _publish_launch_wandb_config_parameters(
-        self, config_parameters: pb.LaunchWandbConfigParametersRecord
-    ) -> None:
-        raise NotImplementedError
-
     @abstractmethod
     def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
         raise NotImplementedError
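For orientation, a rough sketch of how the new publish_job_input API is driven; the `interface` object and the config keys below are hypothetical, and in practice this call is made internally by the SDK's job-building code rather than by end users.

# Hypothetical sketch: `interface` stands in for a concrete InterfaceBase
# implementation that the SDK constructs internally.
interface.publish_job_input(
    include_paths=[["train"]],           # treat everything under config["train"] as job input
    exclude_paths=[["train", "seed"]],   # ...except config["train"]["seed"]
    run_config=True,                     # source values from wandb.config
)

# Or source the inputs from a config file instead (mutually exclusive with run_config=True):
interface.publish_job_input(
    include_paths=[["model"]],
    exclude_paths=[],
    file_path="configs/train.yaml",      # hypothetical file path
)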
wandb/sdk/interface/interface_shared.py
CHANGED
@@ -100,6 +100,10 @@ class InterfaceShared(InterfaceBase):
         rec = self._make_record(telemetry=telem)
         self._publish(rec)

+    def _publish_job_input(self, job_input: pb.JobInputRequest) -> MailboxHandle:
+        record = self._make_request(job_input=job_input)
+        return self._deliver_record(record)
+
     def _make_stats(self, stats_dict: dict) -> pb.StatsRecord:
         stats = pb.StatsRecord()
         stats.stats_type = pb.StatsRecord.StatsType.SYSTEM
@@ -147,6 +151,7 @@ class InterfaceShared(InterfaceBase):
         telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
         get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
         python_packages: Optional[pb.PythonPackagesRequest] = None,
+        job_input: Optional[pb.JobInputRequest] = None,
     ) -> pb.Record:
         request = pb.Request()
         if login:
@@ -207,6 +212,8 @@ class InterfaceShared(InterfaceBase):
             request.sync.CopyFrom(sync)
         elif python_packages:
             request.python_packages.CopyFrom(python_packages)
+        elif job_input:
+            request.job_input.CopyFrom(job_input)
         else:
             raise Exception("Invalid request")
         record = self._make_record(request=request)
@@ -239,9 +246,6 @@ class InterfaceShared(InterfaceBase):
         use_artifact: Optional[pb.UseArtifactRecord] = None,
         output: Optional[pb.OutputRecord] = None,
         output_raw: Optional[pb.OutputRawRecord] = None,
-        launch_wandb_config_parameters: Optional[
-            pb.LaunchWandbConfigParametersRecord
-        ] = None,
     ) -> pb.Record:
         record = pb.Record()
         if run:
@@ -286,8 +290,6 @@ class InterfaceShared(InterfaceBase):
             record.output.CopyFrom(output)
         elif output_raw:
             record.output_raw.CopyFrom(output_raw)
-        elif launch_wandb_config_parameters:
-            record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
         else:
             raise Exception("Invalid record")
         return record
@@ -417,14 +419,6 @@ class InterfaceShared(InterfaceBase):
         rec = self._make_record(alert=proto_alert)
         self._publish(rec)

-    def _publish_launch_wandb_config_parameters(
-        self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
-    ) -> None:
-        rec = self._make_record(
-            launch_wandb_config_parameters=launch_wandb_config_parameters
-        )
-        self._publish(rec)
-
     def _communicate_status(
         self, status: pb.StatusRequest
     ) -> Optional[pb.StatusResponse]:
wandb/sdk/internal/datastore.py
CHANGED
wandb/sdk/internal/handler.py
CHANGED
@@ -50,6 +50,18 @@ SummaryDict = Dict[str, Any]

 logger = logging.getLogger(__name__)

+# Update (March 5, 2024): Since ~2020/2021, when constructing the summary
+# object, we had replaced the artifact path for media types with the latest
+# artifact path. The primary purpose of this was to support live updating of
+# media objects in the UI (since the default artifact path was fully qualified
+# and would not update). However, in March of 2024, a bug was discovered with
+# this approach which causes this path to be incorrect in cases where the media
+# object is logged to another artifact before being logged to the run. Setting
+# this to `False` disables this copy behavior. The impact is that users will
+# need to refresh to see updates. Ironically, this updating behavior is not
+# currently supported in the UI, so the impact of this change is minimal.
+REPLACE_SUMMARY_ART_PATH_WITH_LATEST = False
+

 def _dict_nested_set(target: Dict[str, Any], key_list: Sequence[str], v: Any) -> None:
     # recurse down the dictionary structure:
@@ -371,7 +383,11 @@ class HandleManager:
             updated = True
         return updated
     # If the dict is a media object, update the pointer to the latest alias
-    elif
+    elif (
+        REPLACE_SUMMARY_ART_PATH_WITH_LATEST
+        and isinstance(v, dict)
+        and handler_util.metric_is_wandb_dict(v)
+    ):
         if "_latest_artifact_path" in v and "artifact_path" in v:
             # TODO: Make non-destructive?
             v["artifact_path"] = v["_latest_artifact_path"]
@@ -381,7 +397,7 @@ class HandleManager:
     def _update_summary_media_objects(self, v: Dict[str, Any]) -> Dict[str, Any]:
         # For now, non-recursive - just top level
         for nk, nv in v.items():
-            if (
+            if REPLACE_SUMMARY_ART_PATH_WITH_LATEST and (
                 isinstance(nv, dict)
                 and handler_util.metric_is_wandb_dict(nv)
                 and "_latest_artifact_path" in nv
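A toy rendering of what the new module-level switch gates, assuming a media summary dict shaped like the one the handler checks for (the paths below are invented):

REPLACE_SUMMARY_ART_PATH_WITH_LATEST = False  # switch from the diff above

def _maybe_point_to_latest(v: dict) -> dict:
    # With the switch off, the fully qualified artifact_path is left as logged;
    # flipping it to True would restore the old copy-to-":latest" behavior.
    if (
        REPLACE_SUMMARY_ART_PATH_WITH_LATEST
        and "_latest_artifact_path" in v
        and "artifact_path" in v
    ):
        v["artifact_path"] = v["_latest_artifact_path"]
    return v

media = {
    "artifact_path": "wandb-artifact://example/run-media.png",
    "_latest_artifact_path": "wandb-artifact://example/media.png:latest",
}
assert _maybe_point_to_latest(media)["artifact_path"].endswith("run-media.png")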
wandb/sdk/internal/internal.py
CHANGED
wandb/sdk/internal/job_builder.py
CHANGED
@@ -1,4 +1,5 @@
 """job builder."""
+
 import json
 import logging
 import os
@@ -105,9 +106,9 @@ class JobBuilder:
         self._disable = settings.disable_job_creation
         self._partial_source = None
         self._aliases = []
-        self._source_type: Optional[
-
-
+        self._source_type: Optional[Literal["repo", "artifact", "image"]] = (
+            settings.job_source  # type: ignore[assignment]
+        )
         self._is_notebook_run = self._get_is_notebook_run()
         self._verbose = verbose

@@ -275,7 +276,7 @@ class JobBuilder:
         return source, name

     def _make_job_name(self, input_str: str) -> str:
-        """Use job name from settings if provided, else use
+        """Use job name from settings if provided, else use programmatic name."""
         if self._settings.job_name:
             return self._settings.job_name

wandb/sdk/internal/profiler.py
CHANGED
wandb/sdk/internal/run.py
CHANGED
wandb/sdk/internal/sender.py
CHANGED
@@ -910,7 +910,7 @@ class SendManager:
         is_wandb_init = self._run is None

         # save start time of a run
-        self._start_time = run.start_time.ToMicroseconds() // 1e6
+        self._start_time = int(run.start_time.ToMicroseconds() // 1e6)

         # update telemetry
         if run.telemetry:
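A quick illustration of why the cast was added: floor division by the float literal 1e6 still produces a float, so the start time was previously stored as a float number of seconds.

start_time_us = 1_714_000_000_000_000  # e.g. a timestamp in microseconds (invented value)

print(start_time_us // 1e6)       # 1714000000.0 -- float, because 1e6 is a float
print(int(start_time_us // 1e6))  # 1714000000   -- plain int seconds, as stored now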
wandb/sdk/internal/system/assets/gpu_amd.py
CHANGED
@@ -28,14 +28,6 @@ logger = logging.getLogger(__name__)
 ROCM_SMI_CMD: Final[str] = shutil.which("rocm-smi") or "/usr/bin/rocm-smi"


-def get_rocm_smi_stats() -> Dict[str, Any]:
-    command = [str(ROCM_SMI_CMD), "-a", "--json"]
-    output = subprocess.check_output(command, universal_newlines=True).strip()
-    if "No AMD GPUs specified" in output:
-        return {}
-    return json.loads(output.split("\n")[0])  # type: ignore
-
-
 _StatsKeys = Literal[
     "gpu",
     "memoryAllocated",
@@ -49,6 +41,48 @@ _Stats = Dict[_StatsKeys, float]
 _InfoDict = Dict[str, Union[int, List[Dict[str, Any]]]]


+def get_rocm_smi_stats() -> Dict[str, Any]:
+    command = [str(ROCM_SMI_CMD), "-a", "--json"]
+    output = subprocess.check_output(command, universal_newlines=True).strip()
+    if "No AMD GPUs specified" in output:
+        return {}
+    return json.loads(output.split("\n")[0])  # type: ignore
+
+
+def parse_stats(stats: Dict[str, str]) -> _Stats:
+    """Parse stats from rocm-smi output."""
+    parsed_stats: _Stats = {}
+
+    try:
+        parsed_stats["gpu"] = float(stats.get("GPU use (%)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU usage as float")
+    try:
+        parsed_stats["memoryAllocated"] = float(stats.get("GPU memory use (%)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU memory allocation as float")
+    try:
+        parsed_stats["temp"] = float(stats.get("Temperature (Sensor memory) (C)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU temperature as float")
+    try:
+        parsed_stats["powerWatts"] = float(
+            stats.get("Average Graphics Package Power (W)")  # type: ignore
+        )
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU power as float")
+    try:
+        parsed_stats["powerPercent"] = (
+            float(stats.get("Average Graphics Package Power (W)"))  # type: ignore
+            / float(stats.get("Max Graphics Package Power (W)"))  # type: ignore
+            * 100
+        )
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU average/max power as float")
+
+    return parsed_stats
+
+
 class GPUAMDStats:
     """Stats for AMD GPU devices."""

@@ -58,40 +92,6 @@ class GPUAMDStats:
     def __init__(self) -> None:
         self.samples = deque()

-    @staticmethod
-    def parse_stats(stats: Dict[str, str]) -> _Stats:
-        """Parse stats from rocm-smi output."""
-        parsed_stats: _Stats = {}
-
-        try:
-            parsed_stats["gpu"] = float(stats.get("GPU use (%)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU usage as float")
-        try:
-            parsed_stats["memoryAllocated"] = float(stats.get("GPU memory use (%)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU memory allocation as float")
-        try:
-            parsed_stats["temp"] = float(stats.get("Temperature (Sensor memory) (C)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU temperature as float")
-        try:
-            parsed_stats["powerWatts"] = float(
-                stats.get("Average Graphics Package Power (W)")  # type: ignore
-            )
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU power as float")
-        try:
-            parsed_stats["powerPercent"] = (
-                float(stats.get("Average Graphics Package Power (W)"))  # type: ignore
-                / float(stats.get("Max Graphics Package Power (W)"))  # type: ignore
-                * 100
-            )
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU average/max power as float")
-
-        return parsed_stats
-
     def sample(self) -> None:
         try:
             raw_stats = get_rocm_smi_stats()
@@ -103,7 +103,7 @@ class GPUAMDStats:

         for card_key in card_keys:
             card_stats = raw_stats[card_key]
-            stats =
+            stats = parse_stats(card_stats)
             if stats:
                 cards.append(stats)

@@ -183,7 +183,7 @@ class GPUAMD:

         can_read_rocm_smi = False
         try:
-            if get_rocm_smi_stats():
+            if parse_stats(get_rocm_smi_stats()):
                 can_read_rocm_smi = True
         except Exception:
             pass
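To see what the now module-level parse_stats helper produces, here is a rough sketch with a hand-written stand-in for one card's `rocm-smi -a --json` entry (field values are invented):

from wandb.sdk.internal.system.assets.gpu_amd import parse_stats  # module-level as of this diff

sample_card = {  # invented values, shaped like one "cardN" entry of the rocm-smi JSON
    "GPU use (%)": "37",
    "GPU memory use (%)": "52",
    "Temperature (Sensor memory) (C)": "61.0",
    "Average Graphics Package Power (W)": "110.0",
    "Max Graphics Package Power (W)": "220.0",
}

print(parse_stats(sample_card))
# roughly: {'gpu': 37.0, 'memoryAllocated': 52.0, 'temp': 61.0,
#           'powerWatts': 110.0, 'powerPercent': 50.0}
# Missing or non-numeric fields are skipped with a warning instead of raising.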
wandb/sdk/internal/system/assets/gpu_apple.py
CHANGED
@@ -37,6 +37,12 @@ class _Stats(TypedDict):
     # cpuWaitMs: float


+def get_apple_gpu_path() -> pathlib.Path:
+    return (
+        pathlib.Path(sys.modules["wandb"].__path__[0]) / "bin" / "apple_gpu_stats"
+    ).resolve()
+
+
 class GPUAppleStats:
     """Apple GPU stats available on Arm Macs."""

@@ -49,9 +55,7 @@ class GPUAppleStats:

     def __init__(self) -> None:
         self.samples = deque()
-        self.binary_path = (
-            pathlib.Path(sys.modules["wandb"].__path__[0]) / "bin" / "apple_gpu_stats"
-        ).resolve()
+        self.binary_path = get_apple_gpu_path()

     def sample(self) -> None:
         try:
@@ -63,22 +67,47 @@ class GPUAppleStats:
             )[0]
             raw_stats = json.loads(output)

+            temp_keys = [
+                "m1Gpu1",
+                "m1Gpu2",
+                "m1Gpu3",
+                "m1Gpu4",
+                "m2Gpu1",
+                "m2Gpu2",
+                "m3Gpu1",
+                "m3Gpu2",
+                "m3Gpu3",
+                "m3Gpu4",
+                "m3Gpu5",
+                "m3Gpu6",
+                "m3Gpu7",
+                "m3Gpu8",
+            ]
+            temp, count = 0, 0
+            for k in temp_keys:
+                if raw_stats.get(k, 0) > 0:
+                    temp += raw_stats[k]
+                    count += 1
+
             stats: _Stats = {
                 "gpu": raw_stats["utilization"],
-                "memoryAllocated":
-
-
-
+                "memoryAllocated": (
+                    raw_stats["inUseSystemMemory"]
+                    / raw_stats["allocatedSystemMemory"]
+                    * 100
+                ),
+                "powerWatts": raw_stats["systemPower"],
+                "powerPercent": (raw_stats["systemPower"] / self.MAX_POWER_WATTS) * 100,
+                "temp": temp / count if count > 0 else 0,
                 # TODO: this stat could be useful eventually, it was consistently
                 # 0 in my experimentation and requires a frontend change
                 # so leaving it out for now.
                 # "cpuWaitMs": raw_stats["cpu_wait_ms"],
             }
-
             self.samples.append(stats)

         except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
-            logger.exception(
+            logger.exception("GPU stats error: %s", e)

     def clear(self) -> None:
         self.samples.clear()
@@ -116,6 +145,7 @@ class GPUApple:
         telemetry_record = telemetry.TelemetryRecord()
         telemetry_record.env.m1_gpu = True
         interface._publish_telemetry(telemetry_record)
+        self.binary_path = get_apple_gpu_path()

     @classmethod
     def is_available(cls) -> bool:
@@ -128,5 +158,20 @@ class GPUApple:
         self.metrics_monitor.finish()

     def probe(self) -> dict:
-
-
+        try:
+            command = [str(self.binary_path), "--json"]
+            output = (
+                subprocess.check_output(command, universal_newlines=True)
+                .strip()
+                .split("\n")
+            )[0]
+            raw_stats = json.loads(output)
+            return {
+                self.name: {
+                    "type": raw_stats["name"],
+                    "vendor": raw_stats["vendor"],
+                }
+            }
+        except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
+            logger.exception("GPU stats error: %s", e)
+            return {self.name: {"type": "arm", "vendor": "Apple"}}
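The temperature reported above is the mean of whichever per-cluster sensors return a positive reading; a stripped-down sketch of that averaging (sensor names and readings are illustrative, not real apple_gpu_stats output):

raw_stats = {"m1Gpu1": 48.0, "m1Gpu2": 50.0, "m1Gpu3": 0, "m1Gpu4": 0}  # invented readings
temp_keys = ["m1Gpu1", "m1Gpu2", "m1Gpu3", "m1Gpu4"]

temp, count = 0, 0
for k in temp_keys:
    if raw_stats.get(k, 0) > 0:  # ignore sensors that report 0 (absent or idle)
        temp += raw_stats[k]
        count += 1

print(temp / count if count > 0 else 0)  # 49.0, the mean of the reporting sensors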
wandb/sdk/internal/system/assets/interfaces.py
CHANGED
@@ -68,8 +68,7 @@ class Asset(Protocol):
     metrics: List[Metric]
     metrics_monitor: "MetricsMonitor"

-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        ...  # pragma: no cover
+    def __init__(self, *args: Any, **kwargs: Any) -> None: ...  # pragma: no cover

     @classmethod
     def is_available(cls) -> bool:
@@ -90,14 +89,13 @@ class Asset(Protocol):


 class Interface(Protocol):
-    def publish_stats(self, stats: dict) -> None:
-        ...  # pragma: no cover
+    def publish_stats(self, stats: dict) -> None: ...  # pragma: no cover

-    def _publish_telemetry(
-
+    def _publish_telemetry(
+        self, telemetry: "TelemetryRecord"
+    ) -> None: ...  # pragma: no cover

-    def publish_files(self, files_dict: "FilesDict") -> None:
-        ...  # pragma: no cover
+    def publish_files(self, files_dict: "FilesDict") -> None: ...  # pragma: no cover


 class MetricsMonitor:
wandb/sdk/internal/system/assets/open_metrics.py
CHANGED
@@ -65,13 +65,13 @@ def _setup_requests_session() -> requests.Session:


 def _nested_dict_to_tuple(
-    nested_dict: Mapping[str, Mapping[str, str]]
+    nested_dict: Mapping[str, Mapping[str, str]],
 ) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
     return tuple((k, *v.items()) for k, v in nested_dict.items())  # type: ignore


 def _tuple_to_nested_dict(
-    nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...]
+    nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...],
 ) -> Dict[str, Dict[str, str]]:
     return {k: dict(v) for k, *v in nested_tuple}

wandb/sdk/internal/system/assets/trainium.py
CHANGED
@@ -197,9 +197,7 @@ class NeuronCoreStats:
                 entry["report"]
                 for entry in raw_stats["neuron_runtime_data"]
                 if self._is_matching_entry(entry)
-            ][
-                0
-            ]  # there should be only one entry with the pid
+            ][0]  # there should be only one entry with the pid

             neuroncores_in_use = neuron_runtime_data["neuroncore_counters"][
                 "neuroncores_in_use"