wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/importers/__init__.py +1 -4
- wandb/apis/importers/internals/internal.py +386 -0
- wandb/apis/importers/internals/protocols.py +125 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +125 -88
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1604 -0
- wandb/apis/public/api.py +7 -10
- wandb/apis/public/artifacts.py +38 -0
- wandb/apis/public/files.py +11 -2
- wandb/apis/reports/v2/__init__.py +0 -19
- wandb/apis/reports/v2/expr_parsing.py +0 -1
- wandb/apis/reports/v2/interface.py +15 -18
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +52 -55
- wandb/integration/gym/__init__.py +2 -1
- wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
- wandb/integration/keras/keras.py +6 -4
- wandb/integration/kfp/kfp_patch.py +2 -2
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +75 -31
- wandb/sdk/artifacts/artifact_manifest.py +5 -2
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
- wandb/sdk/artifacts/artifact_saver.py +19 -47
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
- wandb/sdk/artifacts/storage_policy.py +4 -1
- wandb/sdk/data_types/base_types/wb_value.py +1 -1
- wandb/sdk/data_types/image.py +2 -2
- wandb/sdk/interface/interface.py +49 -13
- wandb/sdk/interface/interface_shared.py +17 -11
- wandb/sdk/internal/file_stream.py +20 -1
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +3 -1
- wandb/sdk/internal/job_builder.py +49 -19
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/sender.py +96 -124
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +5 -3
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/_launch.py +3 -3
- wandb/sdk/launch/_launch_add.py +28 -29
- wandb/sdk/launch/_project_spec.py +148 -136
- wandb/sdk/launch/agent/agent.py +3 -7
- wandb/sdk/launch/agent/config.py +0 -27
- wandb/sdk/launch/builder/build.py +54 -28
- wandb/sdk/launch/builder/docker_builder.py +4 -15
- wandb/sdk/launch/builder/kaniko_builder.py +72 -45
- wandb/sdk/launch/create_job.py +6 -40
- wandb/sdk/launch/loader.py +10 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/local_registry.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/runner/local_container.py +15 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +11 -3
- wandb/sdk/launch/utils.py +14 -0
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +4 -1
- wandb/sdk/lib/apikey.py +0 -5
- wandb/sdk/lib/config_util.py +0 -31
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +72 -0
- wandb/sdk/service/service.py +7 -2
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +12 -1
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +164 -110
- wandb/sdk/wandb_settings.py +58 -16
- wandb/testing/relay.py +5 -6
- wandb/util.py +50 -7
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
- wandb/apis/importers/base.py +0 -400
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
"""WandB storage policy."""
|
2
2
|
import hashlib
|
3
3
|
import math
|
4
|
+
import os
|
4
5
|
import shutil
|
5
6
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
|
6
7
|
from urllib.parse import quote
|
@@ -8,12 +9,12 @@ from urllib.parse import quote
|
|
8
9
|
import requests
|
9
10
|
import urllib3
|
10
11
|
|
11
|
-
from wandb.apis import InternalApi
|
12
12
|
from wandb.errors.term import termwarn
|
13
13
|
from wandb.sdk.artifacts.artifact_file_cache import (
|
14
14
|
ArtifactFileCache,
|
15
15
|
get_artifact_file_cache,
|
16
16
|
)
|
17
|
+
from wandb.sdk.artifacts.staging import get_staging_dir
|
17
18
|
from wandb.sdk.artifacts.storage_handlers.azure_handler import AzureHandler
|
18
19
|
from wandb.sdk.artifacts.storage_handlers.gcs_handler import GCSHandler
|
19
20
|
from wandb.sdk.artifacts.storage_handlers.http_handler import HTTPHandler
|
@@ -28,6 +29,7 @@ from wandb.sdk.artifacts.storage_handlers.wb_local_artifact_handler import (
|
|
28
29
|
from wandb.sdk.artifacts.storage_layout import StorageLayout
|
29
30
|
from wandb.sdk.artifacts.storage_policies.register import WANDB_STORAGE_POLICY
|
30
31
|
from wandb.sdk.artifacts.storage_policy import StoragePolicy
|
32
|
+
from wandb.sdk.internal.internal_api import Api as InternalApi
|
31
33
|
from wandb.sdk.internal.thread_local_settings import _thread_local_api_settings
|
32
34
|
from wandb.sdk.lib.hashutil import B64MD5, b64_to_hex_id, hex_to_b64_id
|
33
35
|
from wandb.sdk.lib.paths import FilePathStr, URIStr
|
@@ -60,8 +62,10 @@ class WandbStoragePolicy(StoragePolicy):
|
|
60
62
|
return WANDB_STORAGE_POLICY
|
61
63
|
|
62
64
|
@classmethod
|
63
|
-
def from_config(
|
64
|
-
|
65
|
+
def from_config(
|
66
|
+
cls, config: Dict, api: Optional[InternalApi] = None
|
67
|
+
) -> "WandbStoragePolicy":
|
68
|
+
return cls(config=config, api=api)
|
65
69
|
|
66
70
|
def __init__(
|
67
71
|
self,
|
@@ -131,6 +135,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
131
135
|
if manifest_entry._download_url is None:
|
132
136
|
auth = None
|
133
137
|
if not _thread_local_api_settings.cookies:
|
138
|
+
assert self._api.api_key is not None
|
134
139
|
auth = ("api", self._api.api_key)
|
135
140
|
response = self._session.get(
|
136
141
|
self._file_url(self._api, artifact.entity, manifest_entry),
|
@@ -222,9 +227,10 @@ class WandbStoragePolicy(StoragePolicy):
|
|
222
227
|
extra_headers={
|
223
228
|
"content-md5": md5_b64_str,
|
224
229
|
"content-length": str(len(data)),
|
225
|
-
"content-type": extra_headers.get("Content-Type"),
|
230
|
+
"content-type": extra_headers.get("Content-Type", ""),
|
226
231
|
},
|
227
232
|
)
|
233
|
+
assert upload_resp is not None
|
228
234
|
etags.append(
|
229
235
|
{"partNumber": part_number, "hexMD5": upload_resp.headers["ETag"]}
|
230
236
|
)
|
@@ -311,7 +317,6 @@ class WandbStoragePolicy(StoragePolicy):
|
|
311
317
|
return True
|
312
318
|
if entry.local_path is None:
|
313
319
|
return False
|
314
|
-
|
315
320
|
extra_headers = {
|
316
321
|
header.split(":", 1)[0]: header.split(":", 1)[1]
|
317
322
|
for header in (resp.upload_headers or {})
|
@@ -333,6 +338,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
333
338
|
multipart_urls,
|
334
339
|
extra_headers,
|
335
340
|
)
|
341
|
+
assert resp.storage_path is not None
|
336
342
|
self._api.complete_multipart_upload_artifact(
|
337
343
|
artifact_id, resp.storage_path, etags, resp.upload_id
|
338
344
|
)
|
@@ -389,9 +395,16 @@ class WandbStoragePolicy(StoragePolicy):
|
|
389
395
|
B64MD5(entry.digest),
|
390
396
|
entry.size if entry.size is not None else 0,
|
391
397
|
)
|
392
|
-
|
393
|
-
|
398
|
+
|
399
|
+
staging_dir = get_staging_dir()
|
400
|
+
try:
|
401
|
+
if not entry.skip_cache and not hit:
|
394
402
|
with cache_open("wb") as f, open(entry.local_path, "rb") as src:
|
395
403
|
shutil.copyfileobj(src, f)
|
396
|
-
|
397
|
-
|
404
|
+
if entry.local_path.startswith(staging_dir):
|
405
|
+
# Delete staged files here instead of waiting till
|
406
|
+
# all the files are uploaded
|
407
|
+
os.chmod(entry.local_path, 0o600)
|
408
|
+
os.remove(entry.local_path)
|
409
|
+
except OSError as e:
|
410
|
+
termwarn(f"Failed to cache {entry.local_path}, ignoring {e}")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Storage policy."""
|
2
2
|
from typing import TYPE_CHECKING, Dict, Optional, Sequence, Type, Union
|
3
3
|
|
4
|
+
from wandb.sdk.internal.internal_api import Api as InternalApi
|
4
5
|
from wandb.sdk.lib.paths import FilePathStr, URIStr
|
5
6
|
|
6
7
|
if TYPE_CHECKING:
|
@@ -25,7 +26,9 @@ class StoragePolicy:
|
|
25
26
|
raise NotImplementedError
|
26
27
|
|
27
28
|
@classmethod
|
28
|
-
def from_config(
|
29
|
+
def from_config(
|
30
|
+
cls, config: Dict, api: Optional[InternalApi] = None
|
31
|
+
) -> "StoragePolicy":
|
29
32
|
raise NotImplementedError
|
30
33
|
|
31
34
|
def config(self) -> Dict:
|
@@ -11,7 +11,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
11
11
|
|
12
12
|
|
13
13
|
def _server_accepts_client_ids() -> bool:
|
14
|
-
from
|
14
|
+
from wandb.util import parse_version
|
15
15
|
|
16
16
|
# First, if we are offline, assume the backend server cannot
|
17
17
|
# accept client IDs. Unfortunately, this is the best we can do
|
wandb/sdk/data_types/image.py
CHANGED
@@ -42,7 +42,7 @@ def _server_accepts_image_filenames() -> bool:
|
|
42
42
|
max_cli_version = util._get_max_cli_version()
|
43
43
|
if max_cli_version is None:
|
44
44
|
return False
|
45
|
-
from
|
45
|
+
from wandb.util import parse_version
|
46
46
|
|
47
47
|
accepts_image_filenames: bool = parse_version("0.12.10") <= parse_version(
|
48
48
|
max_cli_version
|
@@ -51,7 +51,7 @@ def _server_accepts_image_filenames() -> bool:
|
|
51
51
|
|
52
52
|
|
53
53
|
def _server_accepts_artifact_path() -> bool:
|
54
|
-
from
|
54
|
+
from wandb.util import parse_version
|
55
55
|
|
56
56
|
target_version = "0.12.14"
|
57
57
|
max_cli_version = util._get_max_cli_version() if not util._is_offline() else None
|
wandb/sdk/interface/interface.py
CHANGED
@@ -13,7 +13,17 @@ import os
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
from abc import abstractmethod
|
16
|
-
from typing import
|
16
|
+
from typing import (
|
17
|
+
TYPE_CHECKING,
|
18
|
+
Any,
|
19
|
+
Dict,
|
20
|
+
Iterable,
|
21
|
+
List,
|
22
|
+
NewType,
|
23
|
+
Optional,
|
24
|
+
Tuple,
|
25
|
+
Union,
|
26
|
+
)
|
17
27
|
|
18
28
|
from wandb.proto import wandb_internal_pb2 as pb
|
19
29
|
from wandb.proto import wandb_telemetry_pb2 as tpb
|
@@ -34,6 +44,7 @@ from wandb.util import (
|
|
34
44
|
from ..data_types.utils import history_dict_to_json, val_to_json
|
35
45
|
from ..lib.mailbox import MailboxHandle
|
36
46
|
from . import summary_record as sr
|
47
|
+
from .message_future import MessageFuture
|
37
48
|
|
38
49
|
GlobStr = NewType("GlobStr", str)
|
39
50
|
|
@@ -339,6 +350,7 @@ class InterfaceBase:
|
|
339
350
|
proto_entry.ref = entry.ref
|
340
351
|
if entry.local_path:
|
341
352
|
proto_entry.local_path = entry.local_path
|
353
|
+
proto_entry.skip_cache = entry.skip_cache
|
342
354
|
for k, v in entry.extra.items():
|
343
355
|
proto_extra = proto_entry.extra.add()
|
344
356
|
proto_extra.key = k
|
@@ -452,7 +464,7 @@ class InterfaceBase:
|
|
452
464
|
def _publish_use_artifact(self, proto_artifact: pb.UseArtifactRecord) -> None:
|
453
465
|
raise NotImplementedError
|
454
466
|
|
455
|
-
def
|
467
|
+
def communicate_artifact(
|
456
468
|
self,
|
457
469
|
run: "Run",
|
458
470
|
artifact: "Artifact",
|
@@ -461,7 +473,7 @@ class InterfaceBase:
|
|
461
473
|
is_user_created: bool = False,
|
462
474
|
use_after_commit: bool = False,
|
463
475
|
finalize: bool = True,
|
464
|
-
) ->
|
476
|
+
) -> MessageFuture:
|
465
477
|
proto_run = self._make_run(run)
|
466
478
|
proto_artifact = self._make_artifact(artifact)
|
467
479
|
proto_artifact.run_id = proto_run.run_id
|
@@ -478,11 +490,13 @@ class InterfaceBase:
|
|
478
490
|
if history_step is not None:
|
479
491
|
log_artifact.history_step = history_step
|
480
492
|
log_artifact.staging_dir = get_staging_dir()
|
481
|
-
resp = self.
|
493
|
+
resp = self._communicate_artifact(log_artifact)
|
482
494
|
return resp
|
483
495
|
|
484
496
|
@abstractmethod
|
485
|
-
def
|
497
|
+
def _communicate_artifact(
|
498
|
+
self, log_artifact: pb.LogArtifactRequest
|
499
|
+
) -> MessageFuture:
|
486
500
|
raise NotImplementedError
|
487
501
|
|
488
502
|
def deliver_download_artifact(
|
@@ -753,6 +767,36 @@ class InterfaceBase:
|
|
753
767
|
run_start.run.CopyFrom(run_pb)
|
754
768
|
return self._deliver_run_start(run_start)
|
755
769
|
|
770
|
+
def publish_launch_wandb_config_parameters(
|
771
|
+
self, include_paths: List[List[str]], exclude_paths: List[List[str]]
|
772
|
+
):
|
773
|
+
"""Tells the internal process to treat wandb.config fields as job inputs.
|
774
|
+
|
775
|
+
The paths provided as arguments are sequences of dictionary keys that
|
776
|
+
specify a path within the wandb.config. If a path is included, the
|
777
|
+
corresponding field will be treated as a job input. If a path is
|
778
|
+
excluded, the corresponding field will not be treated as a job input.
|
779
|
+
|
780
|
+
Args:
|
781
|
+
include_paths: paths within config to include as job inputs.
|
782
|
+
exclude_paths: paths within config to exclude as job inputs.
|
783
|
+
|
784
|
+
Returns:
|
785
|
+
None
|
786
|
+
"""
|
787
|
+
config_parameters = pb.LaunchWandbConfigParametersRecord()
|
788
|
+
include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
|
789
|
+
exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
|
790
|
+
config_parameters.include_paths.extend(include_records)
|
791
|
+
config_parameters.exclude_paths.extend(exclude_records)
|
792
|
+
return self._publish_launch_wandb_config_parameters(config_parameters)
|
793
|
+
|
794
|
+
@abstractmethod
|
795
|
+
def _publish_launch_wandb_config_parameters(
|
796
|
+
self, config_parameters: pb.LaunchWandbConfigParametersRecord
|
797
|
+
) -> None:
|
798
|
+
raise NotImplementedError
|
799
|
+
|
756
800
|
@abstractmethod
|
757
801
|
def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
|
758
802
|
raise NotImplementedError
|
@@ -868,11 +912,3 @@ class InterfaceBase:
|
|
868
912
|
self, run_status: pb.RunStatusRequest
|
869
913
|
) -> MailboxHandle:
|
870
914
|
raise NotImplementedError
|
871
|
-
|
872
|
-
def deliver_request_job_info(self) -> MailboxHandle:
|
873
|
-
job_info = pb.JobInfoRequest()
|
874
|
-
return self._deliver_request_job_info(job_info)
|
875
|
-
|
876
|
-
@abstractmethod
|
877
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
878
|
-
raise NotImplementedError
|
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
|
|
145
145
|
cancel: Optional[pb.CancelRequest] = None,
|
146
146
|
summary_record: Optional[pb.SummaryRecordRequest] = None,
|
147
147
|
telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
|
148
|
-
job_info: Optional[pb.JobInfoRequest] = None,
|
149
148
|
get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
|
150
149
|
python_packages: Optional[pb.PythonPackagesRequest] = None,
|
151
150
|
) -> pb.Record:
|
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
|
|
202
201
|
request.summary_record.CopyFrom(summary_record)
|
203
202
|
elif telemetry_record:
|
204
203
|
request.telemetry_record.CopyFrom(telemetry_record)
|
205
|
-
elif job_info:
|
206
|
-
request.job_info.CopyFrom(job_info)
|
207
204
|
elif get_system_metrics:
|
208
205
|
request.get_system_metrics.CopyFrom(get_system_metrics)
|
209
206
|
elif sync:
|
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
|
|
242
239
|
use_artifact: Optional[pb.UseArtifactRecord] = None,
|
243
240
|
output: Optional[pb.OutputRecord] = None,
|
244
241
|
output_raw: Optional[pb.OutputRawRecord] = None,
|
242
|
+
launch_wandb_config_parameters: Optional[
|
243
|
+
pb.LaunchWandbConfigParametersRecord
|
244
|
+
] = None,
|
245
245
|
) -> pb.Record:
|
246
246
|
record = pb.Record()
|
247
247
|
if run:
|
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
|
|
286
286
|
record.output.CopyFrom(output)
|
287
287
|
elif output_raw:
|
288
288
|
record.output_raw.CopyFrom(output_raw)
|
289
|
+
elif launch_wandb_config_parameters:
|
290
|
+
record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
|
289
291
|
else:
|
290
292
|
raise Exception("Invalid record")
|
291
293
|
return record
|
@@ -389,17 +391,17 @@ class InterfaceShared(InterfaceBase):
|
|
389
391
|
rec = self._make_record(files=files)
|
390
392
|
self._publish(rec)
|
391
393
|
|
392
|
-
def _publish_link_artifact(self, link_artifact: pb.LinkArtifactRecord) ->
|
394
|
+
def _publish_link_artifact(self, link_artifact: pb.LinkArtifactRecord) -> Any:
|
393
395
|
rec = self._make_record(link_artifact=link_artifact)
|
394
396
|
self._publish(rec)
|
395
397
|
|
396
|
-
def _publish_use_artifact(self, use_artifact: pb.UseArtifactRecord) ->
|
398
|
+
def _publish_use_artifact(self, use_artifact: pb.UseArtifactRecord) -> Any:
|
397
399
|
rec = self._make_record(use_artifact=use_artifact)
|
398
400
|
self._publish(rec)
|
399
401
|
|
400
|
-
def
|
402
|
+
def _communicate_artifact(self, log_artifact: pb.LogArtifactRequest) -> Any:
|
401
403
|
rec = self._make_request(log_artifact=log_artifact)
|
402
|
-
return self.
|
404
|
+
return self._communicate_async(rec)
|
403
405
|
|
404
406
|
def _deliver_download_artifact(
|
405
407
|
self, download_artifact: pb.DownloadArtifactRequest
|
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
|
|
415
417
|
rec = self._make_record(alert=proto_alert)
|
416
418
|
self._publish(rec)
|
417
419
|
|
420
|
+
def _publish_launch_wandb_config_parameters(
|
421
|
+
self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
|
422
|
+
) -> None:
|
423
|
+
rec = self._make_record(
|
424
|
+
launch_wandb_config_parameters=launch_wandb_config_parameters
|
425
|
+
)
|
426
|
+
self._publish(rec)
|
427
|
+
|
418
428
|
def _communicate_status(
|
419
429
|
self, status: pb.StatusRequest
|
420
430
|
) -> Optional[pb.StatusResponse]:
|
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
|
|
523
533
|
record = self._make_request(run_status=run_status)
|
524
534
|
return self._deliver_record(record)
|
525
535
|
|
526
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
527
|
-
record = self._make_request(job_info=job_info)
|
528
|
-
return self._deliver_record(record)
|
529
|
-
|
530
536
|
def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
|
531
537
|
if self._transport_failed:
|
532
538
|
return True
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import base64
|
2
2
|
import functools
|
3
3
|
import itertools
|
4
|
+
import json
|
4
5
|
import logging
|
5
6
|
import os
|
6
7
|
import queue
|
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
|
|
58
59
|
class DefaultFilePolicy:
|
59
60
|
def __init__(self, start_chunk_id: int = 0) -> None:
|
60
61
|
self._chunk_id = start_chunk_id
|
62
|
+
self.has_debug_log = False
|
61
63
|
|
62
64
|
def process_chunks(
|
63
65
|
self, chunks: List[Chunk]
|
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
|
|
66
68
|
self._chunk_id += len(chunks)
|
67
69
|
return {"offset": chunk_id, "content": [c.data for c in chunks]}
|
68
70
|
|
71
|
+
# TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
|
72
|
+
def _debug_log(self, data: Any):
|
73
|
+
if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
|
74
|
+
return
|
75
|
+
|
76
|
+
loaded = json.loads(data)
|
77
|
+
if not isinstance(loaded, dict):
|
78
|
+
return
|
79
|
+
|
80
|
+
# get key size and convert to MB
|
81
|
+
key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
|
82
|
+
key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
|
83
|
+
wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
|
84
|
+
self.has_debug_log = True
|
85
|
+
|
69
86
|
|
70
87
|
class JsonlFilePolicy(DefaultFilePolicy):
|
71
88
|
def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
|
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
|
|
81
98
|
)
|
82
99
|
wandb.termerror(msg, repeat=False)
|
83
100
|
wandb._sentry.message(msg, repeat=False)
|
101
|
+
self._debug_log(chunk.data)
|
84
102
|
else:
|
85
103
|
chunk_data.append(chunk.data)
|
86
104
|
|
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
|
|
99
117
|
)
|
100
118
|
wandb.termerror(msg, repeat=False)
|
101
119
|
wandb._sentry.message(msg, repeat=False)
|
120
|
+
self._debug_log(data)
|
102
121
|
return False
|
103
122
|
return {"offset": 0, "content": [data]}
|
104
123
|
|
@@ -274,7 +293,7 @@ class CRDedupeFilePolicy(DefaultFilePolicy):
|
|
274
293
|
ret = []
|
275
294
|
for a, b in intervals:
|
276
295
|
processed_chunk: ProcessedChunk = {
|
277
|
-
"offset": a,
|
296
|
+
"offset": self._chunk_id + a,
|
278
297
|
"content": [console[i] for i in range(a, b + 1)],
|
279
298
|
}
|
280
299
|
ret.append(processed_chunk)
|
wandb/sdk/internal/handler.py
CHANGED
@@ -689,7 +689,7 @@ class HandleManager:
|
|
689
689
|
self._settings, interface=self._interface, run_proto=run_start.run
|
690
690
|
)
|
691
691
|
|
692
|
-
if run_start.run.resumed:
|
692
|
+
if run_start.run.resumed or run_start.run.forked:
|
693
693
|
self._step = run_start.run.starting_step
|
694
694
|
result = proto_util._result_from_record(record)
|
695
695
|
self._respond_result(result)
|
@@ -862,9 +862,6 @@ class HandleManager:
|
|
862
862
|
self._respond_result(result)
|
863
863
|
self._stopped.set()
|
864
864
|
|
865
|
-
def handle_request_job_info(self, record: Record) -> None:
|
866
|
-
self._dispatch_record(record, always_send=True)
|
867
|
-
|
868
865
|
def finish(self) -> None:
|
869
866
|
logger.info("shutting down handler")
|
870
867
|
if self._system_monitor is not None:
|
@@ -2150,6 +2150,7 @@ class Api:
|
|
2150
2150
|
name
|
2151
2151
|
}
|
2152
2152
|
}
|
2153
|
+
historyLineCount
|
2153
2154
|
}
|
2154
2155
|
inserted
|
2155
2156
|
_Server_Settings_
|
@@ -2237,6 +2238,7 @@ class Api:
|
|
2237
2238
|
.get("serverSettings", {})
|
2238
2239
|
.get("serverMessages", [])
|
2239
2240
|
)
|
2241
|
+
|
2240
2242
|
return (
|
2241
2243
|
response["upsertBucket"]["bucket"],
|
2242
2244
|
response["upsertBucket"]["inserted"],
|
@@ -3720,7 +3722,7 @@ class Api:
|
|
3720
3722
|
artifact_id: str,
|
3721
3723
|
storage_path: str,
|
3722
3724
|
completed_parts: List[Dict[str, Any]],
|
3723
|
-
upload_id: str,
|
3725
|
+
upload_id: Optional[str],
|
3724
3726
|
complete_multipart_action: str = "Complete",
|
3725
3727
|
) -> Optional[str]:
|
3726
3728
|
mutation = gql(
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import re
|
6
6
|
import sys
|
7
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
import wandb
|
10
10
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
|
|
28
28
|
JOB_FNAME = "wandb-job.json"
|
29
29
|
JOB_ARTIFACT_TYPE = "job"
|
30
30
|
|
31
|
+
LOG_LEVEL = Literal["log", "warn", "error"]
|
32
|
+
|
31
33
|
|
32
34
|
class GitInfo(TypedDict):
|
33
35
|
remote: str
|
@@ -89,8 +91,9 @@ class JobBuilder:
|
|
89
91
|
_job_seq_id: Optional[str]
|
90
92
|
_job_version_alias: Optional[str]
|
91
93
|
_is_notebook_run: bool
|
94
|
+
_verbose: bool
|
92
95
|
|
93
|
-
def __init__(self, settings: SettingsStatic):
|
96
|
+
def __init__(self, settings: SettingsStatic, verbose: bool = False):
|
94
97
|
self._settings = settings
|
95
98
|
self._metadatafile_path = None
|
96
99
|
self._requirements_path = None
|
@@ -106,6 +109,7 @@ class JobBuilder:
|
|
106
109
|
Literal["repo", "artifact", "image"]
|
107
110
|
] = settings.job_source # type: ignore[assignment]
|
108
111
|
self._is_notebook_run = self._get_is_notebook_run()
|
112
|
+
self._verbose = verbose
|
109
113
|
|
110
114
|
def set_config(self, config: Dict[str, Any]) -> None:
|
111
115
|
self._config = config
|
@@ -121,7 +125,9 @@ class JobBuilder:
|
|
121
125
|
def disable(self, val: bool) -> None:
|
122
126
|
self._disable = val
|
123
127
|
|
124
|
-
def _handle_server_artifact(
|
128
|
+
def _handle_server_artifact(
|
129
|
+
self, res: Optional[Dict], artifact: "ArtifactRecord"
|
130
|
+
) -> None:
|
125
131
|
if artifact.type == "job" and res is not None:
|
126
132
|
try:
|
127
133
|
if res["artifactSequence"]["latestArtifact"] is None:
|
@@ -135,7 +141,7 @@ class JobBuilder:
|
|
135
141
|
self._job_seq_id = res["artifactSequence"]["id"]
|
136
142
|
except KeyError as e:
|
137
143
|
_logger.info(f"Malformed response from ArtifactSaver.save {e}")
|
138
|
-
if artifact.type == "code" and
|
144
|
+
if artifact.type == "code" and res is not None:
|
139
145
|
self._logged_code_artifact = ArtifactInfoForJob(
|
140
146
|
{
|
141
147
|
"id": res["id"],
|
@@ -195,6 +201,21 @@ class JobBuilder:
|
|
195
201
|
|
196
202
|
return source, name
|
197
203
|
|
204
|
+
def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
|
205
|
+
log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
|
206
|
+
if level == "log":
|
207
|
+
_logger.info(message)
|
208
|
+
log_func = wandb.termlog
|
209
|
+
elif level == "warn":
|
210
|
+
_logger.warning(message)
|
211
|
+
log_func = wandb.termwarn
|
212
|
+
elif level == "error":
|
213
|
+
_logger.error(message)
|
214
|
+
log_func = wandb.termerror
|
215
|
+
|
216
|
+
if self._verbose and log_func is not None:
|
217
|
+
log_func(message)
|
218
|
+
|
198
219
|
def _build_artifact_job_source(
|
199
220
|
self,
|
200
221
|
program_relpath: str,
|
@@ -210,8 +231,9 @@ class JobBuilder:
|
|
210
231
|
# at the directory the notebook is in instead of the jupyter core
|
211
232
|
if not os.path.exists(os.path.basename(program_relpath)):
|
212
233
|
_logger.info("target path does not exist, exiting")
|
213
|
-
|
214
|
-
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
|
234
|
+
self._log_if_verbose(
|
235
|
+
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
|
236
|
+
"warn",
|
215
237
|
)
|
216
238
|
return None, None
|
217
239
|
full_program_relpath = os.path.basename(program_relpath)
|
@@ -297,22 +319,25 @@ class JobBuilder:
|
|
297
319
|
if not os.path.exists(
|
298
320
|
os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
|
299
321
|
):
|
300
|
-
|
301
|
-
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
322
|
+
self._log_if_verbose(
|
323
|
+
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
324
|
+
"warn",
|
302
325
|
)
|
303
326
|
return None
|
304
327
|
metadata = self._handle_metadata_file()
|
305
328
|
if metadata is None:
|
306
|
-
|
307
|
-
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
|
329
|
+
self._log_if_verbose(
|
330
|
+
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
|
331
|
+
"warn",
|
308
332
|
)
|
309
333
|
return None
|
310
334
|
|
311
335
|
runtime: Optional[str] = metadata.get("python")
|
312
336
|
# can't build a job without a python version
|
313
337
|
if runtime is None:
|
314
|
-
|
315
|
-
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
338
|
+
self._log_if_verbose(
|
339
|
+
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
340
|
+
"warn",
|
316
341
|
)
|
317
342
|
return None
|
318
343
|
|
@@ -343,13 +368,16 @@ class JobBuilder:
|
|
343
368
|
or self._settings.job_source
|
344
369
|
or self._source_type
|
345
370
|
):
|
346
|
-
|
371
|
+
self._log_if_verbose(
|
372
|
+
"No source type found, not creating job artifact", "warn"
|
373
|
+
)
|
347
374
|
return None
|
348
375
|
|
349
376
|
program_relpath = self._get_program_relpath(source_type, metadata)
|
350
377
|
if source_type != "image" and not program_relpath:
|
351
|
-
|
352
|
-
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
378
|
+
self._log_if_verbose(
|
379
|
+
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
380
|
+
"warn",
|
353
381
|
)
|
354
382
|
return None
|
355
383
|
|
@@ -375,10 +403,11 @@ class JobBuilder:
|
|
375
403
|
|
376
404
|
if source is None:
|
377
405
|
if source_type:
|
378
|
-
|
406
|
+
self._log_if_verbose(
|
379
407
|
f"Source type is set to '{source_type}' but some required information is missing "
|
380
408
|
"from the environment. A job will not be created from this run. See "
|
381
|
-
"https://docs.wandb.ai/guides/launch/create-job"
|
409
|
+
"https://docs.wandb.ai/guides/launch/create-job",
|
410
|
+
"warn",
|
382
411
|
)
|
383
412
|
return None
|
384
413
|
|
@@ -445,8 +474,9 @@ class JobBuilder:
|
|
445
474
|
program = metadata.get("program")
|
446
475
|
|
447
476
|
if not program:
|
448
|
-
|
449
|
-
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
|
477
|
+
self._log_if_verbose(
|
478
|
+
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
|
479
|
+
"warn",
|
450
480
|
)
|
451
481
|
|
452
482
|
return program
|
wandb/sdk/internal/profiler.py
CHANGED
@@ -52,7 +52,7 @@ def torch_trace_handler():
|
|
52
52
|
prof.step()
|
53
53
|
```
|
54
54
|
"""
|
55
|
-
from
|
55
|
+
from wandb.util import parse_version
|
56
56
|
|
57
57
|
torch = wandb.util.get_module(PYTORCH_MODULE, required=True)
|
58
58
|
torch_profiler = wandb.util.get_module(PYTORCH_PROFILER_MODULE, required=True)
|