wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/importers/__init__.py +1 -4
- wandb/apis/importers/internals/internal.py +386 -0
- wandb/apis/importers/internals/protocols.py +125 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +125 -88
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1604 -0
- wandb/apis/public/api.py +7 -10
- wandb/apis/public/artifacts.py +38 -0
- wandb/apis/public/files.py +11 -2
- wandb/apis/reports/v2/__init__.py +0 -19
- wandb/apis/reports/v2/expr_parsing.py +0 -1
- wandb/apis/reports/v2/interface.py +15 -18
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +52 -55
- wandb/integration/gym/__init__.py +2 -1
- wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
- wandb/integration/keras/keras.py +6 -4
- wandb/integration/kfp/kfp_patch.py +2 -2
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +75 -31
- wandb/sdk/artifacts/artifact_manifest.py +5 -2
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
- wandb/sdk/artifacts/artifact_saver.py +19 -47
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
- wandb/sdk/artifacts/storage_policy.py +4 -1
- wandb/sdk/data_types/base_types/wb_value.py +1 -1
- wandb/sdk/data_types/image.py +2 -2
- wandb/sdk/interface/interface.py +49 -13
- wandb/sdk/interface/interface_shared.py +17 -11
- wandb/sdk/internal/file_stream.py +20 -1
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +3 -1
- wandb/sdk/internal/job_builder.py +49 -19
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/sender.py +96 -124
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +5 -3
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/_launch.py +3 -3
- wandb/sdk/launch/_launch_add.py +28 -29
- wandb/sdk/launch/_project_spec.py +148 -136
- wandb/sdk/launch/agent/agent.py +3 -7
- wandb/sdk/launch/agent/config.py +0 -27
- wandb/sdk/launch/builder/build.py +54 -28
- wandb/sdk/launch/builder/docker_builder.py +4 -15
- wandb/sdk/launch/builder/kaniko_builder.py +72 -45
- wandb/sdk/launch/create_job.py +6 -40
- wandb/sdk/launch/loader.py +10 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/local_registry.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/runner/local_container.py +15 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +11 -3
- wandb/sdk/launch/utils.py +14 -0
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +4 -1
- wandb/sdk/lib/apikey.py +0 -5
- wandb/sdk/lib/config_util.py +0 -31
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +72 -0
- wandb/sdk/service/service.py +7 -2
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +12 -1
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +164 -110
- wandb/sdk/wandb_settings.py +58 -16
- wandb/testing/relay.py +5 -6
- wandb/util.py +50 -7
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
- wandb/apis/importers/base.py +0 -400
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
"""WandB storage policy."""
|
2
2
|
import hashlib
|
3
3
|
import math
|
4
|
+
import os
|
4
5
|
import shutil
|
5
6
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
|
6
7
|
from urllib.parse import quote
|
@@ -8,12 +9,12 @@ from urllib.parse import quote
|
|
8
9
|
import requests
|
9
10
|
import urllib3
|
10
11
|
|
11
|
-
from wandb.apis import InternalApi
|
12
12
|
from wandb.errors.term import termwarn
|
13
13
|
from wandb.sdk.artifacts.artifact_file_cache import (
|
14
14
|
ArtifactFileCache,
|
15
15
|
get_artifact_file_cache,
|
16
16
|
)
|
17
|
+
from wandb.sdk.artifacts.staging import get_staging_dir
|
17
18
|
from wandb.sdk.artifacts.storage_handlers.azure_handler import AzureHandler
|
18
19
|
from wandb.sdk.artifacts.storage_handlers.gcs_handler import GCSHandler
|
19
20
|
from wandb.sdk.artifacts.storage_handlers.http_handler import HTTPHandler
|
@@ -28,6 +29,7 @@ from wandb.sdk.artifacts.storage_handlers.wb_local_artifact_handler import (
|
|
28
29
|
from wandb.sdk.artifacts.storage_layout import StorageLayout
|
29
30
|
from wandb.sdk.artifacts.storage_policies.register import WANDB_STORAGE_POLICY
|
30
31
|
from wandb.sdk.artifacts.storage_policy import StoragePolicy
|
32
|
+
from wandb.sdk.internal.internal_api import Api as InternalApi
|
31
33
|
from wandb.sdk.internal.thread_local_settings import _thread_local_api_settings
|
32
34
|
from wandb.sdk.lib.hashutil import B64MD5, b64_to_hex_id, hex_to_b64_id
|
33
35
|
from wandb.sdk.lib.paths import FilePathStr, URIStr
|
@@ -60,8 +62,10 @@ class WandbStoragePolicy(StoragePolicy):
|
|
60
62
|
return WANDB_STORAGE_POLICY
|
61
63
|
|
62
64
|
@classmethod
|
63
|
-
def from_config(
|
64
|
-
|
65
|
+
def from_config(
|
66
|
+
cls, config: Dict, api: Optional[InternalApi] = None
|
67
|
+
) -> "WandbStoragePolicy":
|
68
|
+
return cls(config=config, api=api)
|
65
69
|
|
66
70
|
def __init__(
|
67
71
|
self,
|
@@ -131,6 +135,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
131
135
|
if manifest_entry._download_url is None:
|
132
136
|
auth = None
|
133
137
|
if not _thread_local_api_settings.cookies:
|
138
|
+
assert self._api.api_key is not None
|
134
139
|
auth = ("api", self._api.api_key)
|
135
140
|
response = self._session.get(
|
136
141
|
self._file_url(self._api, artifact.entity, manifest_entry),
|
@@ -222,9 +227,10 @@ class WandbStoragePolicy(StoragePolicy):
|
|
222
227
|
extra_headers={
|
223
228
|
"content-md5": md5_b64_str,
|
224
229
|
"content-length": str(len(data)),
|
225
|
-
"content-type": extra_headers.get("Content-Type"),
|
230
|
+
"content-type": extra_headers.get("Content-Type", ""),
|
226
231
|
},
|
227
232
|
)
|
233
|
+
assert upload_resp is not None
|
228
234
|
etags.append(
|
229
235
|
{"partNumber": part_number, "hexMD5": upload_resp.headers["ETag"]}
|
230
236
|
)
|
@@ -311,7 +317,6 @@ class WandbStoragePolicy(StoragePolicy):
|
|
311
317
|
return True
|
312
318
|
if entry.local_path is None:
|
313
319
|
return False
|
314
|
-
|
315
320
|
extra_headers = {
|
316
321
|
header.split(":", 1)[0]: header.split(":", 1)[1]
|
317
322
|
for header in (resp.upload_headers or {})
|
@@ -333,6 +338,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
333
338
|
multipart_urls,
|
334
339
|
extra_headers,
|
335
340
|
)
|
341
|
+
assert resp.storage_path is not None
|
336
342
|
self._api.complete_multipart_upload_artifact(
|
337
343
|
artifact_id, resp.storage_path, etags, resp.upload_id
|
338
344
|
)
|
@@ -389,9 +395,16 @@ class WandbStoragePolicy(StoragePolicy):
|
|
389
395
|
B64MD5(entry.digest),
|
390
396
|
entry.size if entry.size is not None else 0,
|
391
397
|
)
|
392
|
-
|
393
|
-
|
398
|
+
|
399
|
+
staging_dir = get_staging_dir()
|
400
|
+
try:
|
401
|
+
if not entry.skip_cache and not hit:
|
394
402
|
with cache_open("wb") as f, open(entry.local_path, "rb") as src:
|
395
403
|
shutil.copyfileobj(src, f)
|
396
|
-
|
397
|
-
|
404
|
+
if entry.local_path.startswith(staging_dir):
|
405
|
+
# Delete staged files here instead of waiting till
|
406
|
+
# all the files are uploaded
|
407
|
+
os.chmod(entry.local_path, 0o600)
|
408
|
+
os.remove(entry.local_path)
|
409
|
+
except OSError as e:
|
410
|
+
termwarn(f"Failed to cache {entry.local_path}, ignoring {e}")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Storage policy."""
|
2
2
|
from typing import TYPE_CHECKING, Dict, Optional, Sequence, Type, Union
|
3
3
|
|
4
|
+
from wandb.sdk.internal.internal_api import Api as InternalApi
|
4
5
|
from wandb.sdk.lib.paths import FilePathStr, URIStr
|
5
6
|
|
6
7
|
if TYPE_CHECKING:
|
@@ -25,7 +26,9 @@ class StoragePolicy:
|
|
25
26
|
raise NotImplementedError
|
26
27
|
|
27
28
|
@classmethod
|
28
|
-
def from_config(
|
29
|
+
def from_config(
|
30
|
+
cls, config: Dict, api: Optional[InternalApi] = None
|
31
|
+
) -> "StoragePolicy":
|
29
32
|
raise NotImplementedError
|
30
33
|
|
31
34
|
def config(self) -> Dict:
|
@@ -11,7 +11,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
11
11
|
|
12
12
|
|
13
13
|
def _server_accepts_client_ids() -> bool:
|
14
|
-
from
|
14
|
+
from wandb.util import parse_version
|
15
15
|
|
16
16
|
# First, if we are offline, assume the backend server cannot
|
17
17
|
# accept client IDs. Unfortunately, this is the best we can do
|
wandb/sdk/data_types/image.py
CHANGED
@@ -42,7 +42,7 @@ def _server_accepts_image_filenames() -> bool:
|
|
42
42
|
max_cli_version = util._get_max_cli_version()
|
43
43
|
if max_cli_version is None:
|
44
44
|
return False
|
45
|
-
from
|
45
|
+
from wandb.util import parse_version
|
46
46
|
|
47
47
|
accepts_image_filenames: bool = parse_version("0.12.10") <= parse_version(
|
48
48
|
max_cli_version
|
@@ -51,7 +51,7 @@ def _server_accepts_image_filenames() -> bool:
|
|
51
51
|
|
52
52
|
|
53
53
|
def _server_accepts_artifact_path() -> bool:
|
54
|
-
from
|
54
|
+
from wandb.util import parse_version
|
55
55
|
|
56
56
|
target_version = "0.12.14"
|
57
57
|
max_cli_version = util._get_max_cli_version() if not util._is_offline() else None
|
wandb/sdk/interface/interface.py
CHANGED
@@ -13,7 +13,17 @@ import os
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
from abc import abstractmethod
|
16
|
-
from typing import
|
16
|
+
from typing import (
|
17
|
+
TYPE_CHECKING,
|
18
|
+
Any,
|
19
|
+
Dict,
|
20
|
+
Iterable,
|
21
|
+
List,
|
22
|
+
NewType,
|
23
|
+
Optional,
|
24
|
+
Tuple,
|
25
|
+
Union,
|
26
|
+
)
|
17
27
|
|
18
28
|
from wandb.proto import wandb_internal_pb2 as pb
|
19
29
|
from wandb.proto import wandb_telemetry_pb2 as tpb
|
@@ -34,6 +44,7 @@ from wandb.util import (
|
|
34
44
|
from ..data_types.utils import history_dict_to_json, val_to_json
|
35
45
|
from ..lib.mailbox import MailboxHandle
|
36
46
|
from . import summary_record as sr
|
47
|
+
from .message_future import MessageFuture
|
37
48
|
|
38
49
|
GlobStr = NewType("GlobStr", str)
|
39
50
|
|
@@ -339,6 +350,7 @@ class InterfaceBase:
|
|
339
350
|
proto_entry.ref = entry.ref
|
340
351
|
if entry.local_path:
|
341
352
|
proto_entry.local_path = entry.local_path
|
353
|
+
proto_entry.skip_cache = entry.skip_cache
|
342
354
|
for k, v in entry.extra.items():
|
343
355
|
proto_extra = proto_entry.extra.add()
|
344
356
|
proto_extra.key = k
|
@@ -452,7 +464,7 @@ class InterfaceBase:
|
|
452
464
|
def _publish_use_artifact(self, proto_artifact: pb.UseArtifactRecord) -> None:
|
453
465
|
raise NotImplementedError
|
454
466
|
|
455
|
-
def
|
467
|
+
def communicate_artifact(
|
456
468
|
self,
|
457
469
|
run: "Run",
|
458
470
|
artifact: "Artifact",
|
@@ -461,7 +473,7 @@ class InterfaceBase:
|
|
461
473
|
is_user_created: bool = False,
|
462
474
|
use_after_commit: bool = False,
|
463
475
|
finalize: bool = True,
|
464
|
-
) ->
|
476
|
+
) -> MessageFuture:
|
465
477
|
proto_run = self._make_run(run)
|
466
478
|
proto_artifact = self._make_artifact(artifact)
|
467
479
|
proto_artifact.run_id = proto_run.run_id
|
@@ -478,11 +490,13 @@ class InterfaceBase:
|
|
478
490
|
if history_step is not None:
|
479
491
|
log_artifact.history_step = history_step
|
480
492
|
log_artifact.staging_dir = get_staging_dir()
|
481
|
-
resp = self.
|
493
|
+
resp = self._communicate_artifact(log_artifact)
|
482
494
|
return resp
|
483
495
|
|
484
496
|
@abstractmethod
|
485
|
-
def
|
497
|
+
def _communicate_artifact(
|
498
|
+
self, log_artifact: pb.LogArtifactRequest
|
499
|
+
) -> MessageFuture:
|
486
500
|
raise NotImplementedError
|
487
501
|
|
488
502
|
def deliver_download_artifact(
|
@@ -753,6 +767,36 @@ class InterfaceBase:
|
|
753
767
|
run_start.run.CopyFrom(run_pb)
|
754
768
|
return self._deliver_run_start(run_start)
|
755
769
|
|
770
|
+
def publish_launch_wandb_config_parameters(
|
771
|
+
self, include_paths: List[List[str]], exclude_paths: List[List[str]]
|
772
|
+
):
|
773
|
+
"""Tells the internal process to treat wandb.config fields as job inputs.
|
774
|
+
|
775
|
+
The paths provided as arguments are sequences of dictionary keys that
|
776
|
+
specify a path within the wandb.config. If a path is included, the
|
777
|
+
corresponding field will be treated as a job input. If a path is
|
778
|
+
excluded, the corresponding field will not be treated as a job input.
|
779
|
+
|
780
|
+
Args:
|
781
|
+
include_paths: paths within config to include as job inputs.
|
782
|
+
exclude_paths: paths within config to exclude as job inputs.
|
783
|
+
|
784
|
+
Returns:
|
785
|
+
None
|
786
|
+
"""
|
787
|
+
config_parameters = pb.LaunchWandbConfigParametersRecord()
|
788
|
+
include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
|
789
|
+
exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
|
790
|
+
config_parameters.include_paths.extend(include_records)
|
791
|
+
config_parameters.exclude_paths.extend(exclude_records)
|
792
|
+
return self._publish_launch_wandb_config_parameters(config_parameters)
|
793
|
+
|
794
|
+
@abstractmethod
|
795
|
+
def _publish_launch_wandb_config_parameters(
|
796
|
+
self, config_parameters: pb.LaunchWandbConfigParametersRecord
|
797
|
+
) -> None:
|
798
|
+
raise NotImplementedError
|
799
|
+
|
756
800
|
@abstractmethod
|
757
801
|
def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
|
758
802
|
raise NotImplementedError
|
@@ -868,11 +912,3 @@ class InterfaceBase:
|
|
868
912
|
self, run_status: pb.RunStatusRequest
|
869
913
|
) -> MailboxHandle:
|
870
914
|
raise NotImplementedError
|
871
|
-
|
872
|
-
def deliver_request_job_info(self) -> MailboxHandle:
|
873
|
-
job_info = pb.JobInfoRequest()
|
874
|
-
return self._deliver_request_job_info(job_info)
|
875
|
-
|
876
|
-
@abstractmethod
|
877
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
878
|
-
raise NotImplementedError
|
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
|
|
145
145
|
cancel: Optional[pb.CancelRequest] = None,
|
146
146
|
summary_record: Optional[pb.SummaryRecordRequest] = None,
|
147
147
|
telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
|
148
|
-
job_info: Optional[pb.JobInfoRequest] = None,
|
149
148
|
get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
|
150
149
|
python_packages: Optional[pb.PythonPackagesRequest] = None,
|
151
150
|
) -> pb.Record:
|
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
|
|
202
201
|
request.summary_record.CopyFrom(summary_record)
|
203
202
|
elif telemetry_record:
|
204
203
|
request.telemetry_record.CopyFrom(telemetry_record)
|
205
|
-
elif job_info:
|
206
|
-
request.job_info.CopyFrom(job_info)
|
207
204
|
elif get_system_metrics:
|
208
205
|
request.get_system_metrics.CopyFrom(get_system_metrics)
|
209
206
|
elif sync:
|
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
|
|
242
239
|
use_artifact: Optional[pb.UseArtifactRecord] = None,
|
243
240
|
output: Optional[pb.OutputRecord] = None,
|
244
241
|
output_raw: Optional[pb.OutputRawRecord] = None,
|
242
|
+
launch_wandb_config_parameters: Optional[
|
243
|
+
pb.LaunchWandbConfigParametersRecord
|
244
|
+
] = None,
|
245
245
|
) -> pb.Record:
|
246
246
|
record = pb.Record()
|
247
247
|
if run:
|
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
|
|
286
286
|
record.output.CopyFrom(output)
|
287
287
|
elif output_raw:
|
288
288
|
record.output_raw.CopyFrom(output_raw)
|
289
|
+
elif launch_wandb_config_parameters:
|
290
|
+
record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
|
289
291
|
else:
|
290
292
|
raise Exception("Invalid record")
|
291
293
|
return record
|
@@ -389,17 +391,17 @@ class InterfaceShared(InterfaceBase):
|
|
389
391
|
rec = self._make_record(files=files)
|
390
392
|
self._publish(rec)
|
391
393
|
|
392
|
-
def _publish_link_artifact(self, link_artifact: pb.LinkArtifactRecord) ->
|
394
|
+
def _publish_link_artifact(self, link_artifact: pb.LinkArtifactRecord) -> Any:
|
393
395
|
rec = self._make_record(link_artifact=link_artifact)
|
394
396
|
self._publish(rec)
|
395
397
|
|
396
|
-
def _publish_use_artifact(self, use_artifact: pb.UseArtifactRecord) ->
|
398
|
+
def _publish_use_artifact(self, use_artifact: pb.UseArtifactRecord) -> Any:
|
397
399
|
rec = self._make_record(use_artifact=use_artifact)
|
398
400
|
self._publish(rec)
|
399
401
|
|
400
|
-
def
|
402
|
+
def _communicate_artifact(self, log_artifact: pb.LogArtifactRequest) -> Any:
|
401
403
|
rec = self._make_request(log_artifact=log_artifact)
|
402
|
-
return self.
|
404
|
+
return self._communicate_async(rec)
|
403
405
|
|
404
406
|
def _deliver_download_artifact(
|
405
407
|
self, download_artifact: pb.DownloadArtifactRequest
|
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
|
|
415
417
|
rec = self._make_record(alert=proto_alert)
|
416
418
|
self._publish(rec)
|
417
419
|
|
420
|
+
def _publish_launch_wandb_config_parameters(
|
421
|
+
self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
|
422
|
+
) -> None:
|
423
|
+
rec = self._make_record(
|
424
|
+
launch_wandb_config_parameters=launch_wandb_config_parameters
|
425
|
+
)
|
426
|
+
self._publish(rec)
|
427
|
+
|
418
428
|
def _communicate_status(
|
419
429
|
self, status: pb.StatusRequest
|
420
430
|
) -> Optional[pb.StatusResponse]:
|
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
|
|
523
533
|
record = self._make_request(run_status=run_status)
|
524
534
|
return self._deliver_record(record)
|
525
535
|
|
526
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
527
|
-
record = self._make_request(job_info=job_info)
|
528
|
-
return self._deliver_record(record)
|
529
|
-
|
530
536
|
def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
|
531
537
|
if self._transport_failed:
|
532
538
|
return True
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import base64
|
2
2
|
import functools
|
3
3
|
import itertools
|
4
|
+
import json
|
4
5
|
import logging
|
5
6
|
import os
|
6
7
|
import queue
|
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
|
|
58
59
|
class DefaultFilePolicy:
|
59
60
|
def __init__(self, start_chunk_id: int = 0) -> None:
|
60
61
|
self._chunk_id = start_chunk_id
|
62
|
+
self.has_debug_log = False
|
61
63
|
|
62
64
|
def process_chunks(
|
63
65
|
self, chunks: List[Chunk]
|
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
|
|
66
68
|
self._chunk_id += len(chunks)
|
67
69
|
return {"offset": chunk_id, "content": [c.data for c in chunks]}
|
68
70
|
|
71
|
+
# TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
|
72
|
+
def _debug_log(self, data: Any):
|
73
|
+
if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
|
74
|
+
return
|
75
|
+
|
76
|
+
loaded = json.loads(data)
|
77
|
+
if not isinstance(loaded, dict):
|
78
|
+
return
|
79
|
+
|
80
|
+
# get key size and convert to MB
|
81
|
+
key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
|
82
|
+
key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
|
83
|
+
wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
|
84
|
+
self.has_debug_log = True
|
85
|
+
|
69
86
|
|
70
87
|
class JsonlFilePolicy(DefaultFilePolicy):
|
71
88
|
def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
|
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
|
|
81
98
|
)
|
82
99
|
wandb.termerror(msg, repeat=False)
|
83
100
|
wandb._sentry.message(msg, repeat=False)
|
101
|
+
self._debug_log(chunk.data)
|
84
102
|
else:
|
85
103
|
chunk_data.append(chunk.data)
|
86
104
|
|
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
|
|
99
117
|
)
|
100
118
|
wandb.termerror(msg, repeat=False)
|
101
119
|
wandb._sentry.message(msg, repeat=False)
|
120
|
+
self._debug_log(data)
|
102
121
|
return False
|
103
122
|
return {"offset": 0, "content": [data]}
|
104
123
|
|
@@ -274,7 +293,7 @@ class CRDedupeFilePolicy(DefaultFilePolicy):
|
|
274
293
|
ret = []
|
275
294
|
for a, b in intervals:
|
276
295
|
processed_chunk: ProcessedChunk = {
|
277
|
-
"offset": a,
|
296
|
+
"offset": self._chunk_id + a,
|
278
297
|
"content": [console[i] for i in range(a, b + 1)],
|
279
298
|
}
|
280
299
|
ret.append(processed_chunk)
|
wandb/sdk/internal/handler.py
CHANGED
@@ -689,7 +689,7 @@ class HandleManager:
|
|
689
689
|
self._settings, interface=self._interface, run_proto=run_start.run
|
690
690
|
)
|
691
691
|
|
692
|
-
if run_start.run.resumed:
|
692
|
+
if run_start.run.resumed or run_start.run.forked:
|
693
693
|
self._step = run_start.run.starting_step
|
694
694
|
result = proto_util._result_from_record(record)
|
695
695
|
self._respond_result(result)
|
@@ -862,9 +862,6 @@ class HandleManager:
|
|
862
862
|
self._respond_result(result)
|
863
863
|
self._stopped.set()
|
864
864
|
|
865
|
-
def handle_request_job_info(self, record: Record) -> None:
|
866
|
-
self._dispatch_record(record, always_send=True)
|
867
|
-
|
868
865
|
def finish(self) -> None:
|
869
866
|
logger.info("shutting down handler")
|
870
867
|
if self._system_monitor is not None:
|
@@ -2150,6 +2150,7 @@ class Api:
|
|
2150
2150
|
name
|
2151
2151
|
}
|
2152
2152
|
}
|
2153
|
+
historyLineCount
|
2153
2154
|
}
|
2154
2155
|
inserted
|
2155
2156
|
_Server_Settings_
|
@@ -2237,6 +2238,7 @@ class Api:
|
|
2237
2238
|
.get("serverSettings", {})
|
2238
2239
|
.get("serverMessages", [])
|
2239
2240
|
)
|
2241
|
+
|
2240
2242
|
return (
|
2241
2243
|
response["upsertBucket"]["bucket"],
|
2242
2244
|
response["upsertBucket"]["inserted"],
|
@@ -3720,7 +3722,7 @@ class Api:
|
|
3720
3722
|
artifact_id: str,
|
3721
3723
|
storage_path: str,
|
3722
3724
|
completed_parts: List[Dict[str, Any]],
|
3723
|
-
upload_id: str,
|
3725
|
+
upload_id: Optional[str],
|
3724
3726
|
complete_multipart_action: str = "Complete",
|
3725
3727
|
) -> Optional[str]:
|
3726
3728
|
mutation = gql(
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import re
|
6
6
|
import sys
|
7
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
import wandb
|
10
10
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
|
|
28
28
|
JOB_FNAME = "wandb-job.json"
|
29
29
|
JOB_ARTIFACT_TYPE = "job"
|
30
30
|
|
31
|
+
LOG_LEVEL = Literal["log", "warn", "error"]
|
32
|
+
|
31
33
|
|
32
34
|
class GitInfo(TypedDict):
|
33
35
|
remote: str
|
@@ -89,8 +91,9 @@ class JobBuilder:
|
|
89
91
|
_job_seq_id: Optional[str]
|
90
92
|
_job_version_alias: Optional[str]
|
91
93
|
_is_notebook_run: bool
|
94
|
+
_verbose: bool
|
92
95
|
|
93
|
-
def __init__(self, settings: SettingsStatic):
|
96
|
+
def __init__(self, settings: SettingsStatic, verbose: bool = False):
|
94
97
|
self._settings = settings
|
95
98
|
self._metadatafile_path = None
|
96
99
|
self._requirements_path = None
|
@@ -106,6 +109,7 @@ class JobBuilder:
|
|
106
109
|
Literal["repo", "artifact", "image"]
|
107
110
|
] = settings.job_source # type: ignore[assignment]
|
108
111
|
self._is_notebook_run = self._get_is_notebook_run()
|
112
|
+
self._verbose = verbose
|
109
113
|
|
110
114
|
def set_config(self, config: Dict[str, Any]) -> None:
|
111
115
|
self._config = config
|
@@ -121,7 +125,9 @@ class JobBuilder:
|
|
121
125
|
def disable(self, val: bool) -> None:
|
122
126
|
self._disable = val
|
123
127
|
|
124
|
-
def _handle_server_artifact(
|
128
|
+
def _handle_server_artifact(
|
129
|
+
self, res: Optional[Dict], artifact: "ArtifactRecord"
|
130
|
+
) -> None:
|
125
131
|
if artifact.type == "job" and res is not None:
|
126
132
|
try:
|
127
133
|
if res["artifactSequence"]["latestArtifact"] is None:
|
@@ -135,7 +141,7 @@ class JobBuilder:
|
|
135
141
|
self._job_seq_id = res["artifactSequence"]["id"]
|
136
142
|
except KeyError as e:
|
137
143
|
_logger.info(f"Malformed response from ArtifactSaver.save {e}")
|
138
|
-
if artifact.type == "code" and
|
144
|
+
if artifact.type == "code" and res is not None:
|
139
145
|
self._logged_code_artifact = ArtifactInfoForJob(
|
140
146
|
{
|
141
147
|
"id": res["id"],
|
@@ -195,6 +201,21 @@ class JobBuilder:
|
|
195
201
|
|
196
202
|
return source, name
|
197
203
|
|
204
|
+
def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
|
205
|
+
log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
|
206
|
+
if level == "log":
|
207
|
+
_logger.info(message)
|
208
|
+
log_func = wandb.termlog
|
209
|
+
elif level == "warn":
|
210
|
+
_logger.warning(message)
|
211
|
+
log_func = wandb.termwarn
|
212
|
+
elif level == "error":
|
213
|
+
_logger.error(message)
|
214
|
+
log_func = wandb.termerror
|
215
|
+
|
216
|
+
if self._verbose and log_func is not None:
|
217
|
+
log_func(message)
|
218
|
+
|
198
219
|
def _build_artifact_job_source(
|
199
220
|
self,
|
200
221
|
program_relpath: str,
|
@@ -210,8 +231,9 @@ class JobBuilder:
|
|
210
231
|
# at the directory the notebook is in instead of the jupyter core
|
211
232
|
if not os.path.exists(os.path.basename(program_relpath)):
|
212
233
|
_logger.info("target path does not exist, exiting")
|
213
|
-
|
214
|
-
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
|
234
|
+
self._log_if_verbose(
|
235
|
+
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
|
236
|
+
"warn",
|
215
237
|
)
|
216
238
|
return None, None
|
217
239
|
full_program_relpath = os.path.basename(program_relpath)
|
@@ -297,22 +319,25 @@ class JobBuilder:
|
|
297
319
|
if not os.path.exists(
|
298
320
|
os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
|
299
321
|
):
|
300
|
-
|
301
|
-
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
322
|
+
self._log_if_verbose(
|
323
|
+
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
324
|
+
"warn",
|
302
325
|
)
|
303
326
|
return None
|
304
327
|
metadata = self._handle_metadata_file()
|
305
328
|
if metadata is None:
|
306
|
-
|
307
|
-
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
|
329
|
+
self._log_if_verbose(
|
330
|
+
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
|
331
|
+
"warn",
|
308
332
|
)
|
309
333
|
return None
|
310
334
|
|
311
335
|
runtime: Optional[str] = metadata.get("python")
|
312
336
|
# can't build a job without a python version
|
313
337
|
if runtime is None:
|
314
|
-
|
315
|
-
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
338
|
+
self._log_if_verbose(
|
339
|
+
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
340
|
+
"warn",
|
316
341
|
)
|
317
342
|
return None
|
318
343
|
|
@@ -343,13 +368,16 @@ class JobBuilder:
|
|
343
368
|
or self._settings.job_source
|
344
369
|
or self._source_type
|
345
370
|
):
|
346
|
-
|
371
|
+
self._log_if_verbose(
|
372
|
+
"No source type found, not creating job artifact", "warn"
|
373
|
+
)
|
347
374
|
return None
|
348
375
|
|
349
376
|
program_relpath = self._get_program_relpath(source_type, metadata)
|
350
377
|
if source_type != "image" and not program_relpath:
|
351
|
-
|
352
|
-
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
378
|
+
self._log_if_verbose(
|
379
|
+
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
380
|
+
"warn",
|
353
381
|
)
|
354
382
|
return None
|
355
383
|
|
@@ -375,10 +403,11 @@ class JobBuilder:
|
|
375
403
|
|
376
404
|
if source is None:
|
377
405
|
if source_type:
|
378
|
-
|
406
|
+
self._log_if_verbose(
|
379
407
|
f"Source type is set to '{source_type}' but some required information is missing "
|
380
408
|
"from the environment. A job will not be created from this run. See "
|
381
|
-
"https://docs.wandb.ai/guides/launch/create-job"
|
409
|
+
"https://docs.wandb.ai/guides/launch/create-job",
|
410
|
+
"warn",
|
382
411
|
)
|
383
412
|
return None
|
384
413
|
|
@@ -445,8 +474,9 @@ class JobBuilder:
|
|
445
474
|
program = metadata.get("program")
|
446
475
|
|
447
476
|
if not program:
|
448
|
-
|
449
|
-
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
|
477
|
+
self._log_if_verbose(
|
478
|
+
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
|
479
|
+
"warn",
|
450
480
|
)
|
451
481
|
|
452
482
|
return program
|
wandb/sdk/internal/profiler.py
CHANGED
@@ -52,7 +52,7 @@ def torch_trace_handler():
|
|
52
52
|
prof.step()
|
53
53
|
```
|
54
54
|
"""
|
55
|
-
from
|
55
|
+
from wandb.util import parse_version
|
56
56
|
|
57
57
|
torch = wandb.util.get_module(PYTORCH_MODULE, required=True)
|
58
58
|
torch_profiler = wandb.util.get_module(PYTORCH_PROFILER_MODULE, required=True)
|