wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/importers/__init__.py +1 -4
- wandb/apis/importers/internals/internal.py +386 -0
- wandb/apis/importers/internals/protocols.py +125 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +125 -88
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1604 -0
- wandb/apis/public/api.py +7 -10
- wandb/apis/public/artifacts.py +38 -0
- wandb/apis/public/files.py +11 -2
- wandb/apis/reports/v2/__init__.py +0 -19
- wandb/apis/reports/v2/expr_parsing.py +0 -1
- wandb/apis/reports/v2/interface.py +15 -18
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +52 -55
- wandb/integration/gym/__init__.py +2 -1
- wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
- wandb/integration/keras/keras.py +6 -4
- wandb/integration/kfp/kfp_patch.py +2 -2
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +75 -31
- wandb/sdk/artifacts/artifact_manifest.py +5 -2
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
- wandb/sdk/artifacts/artifact_saver.py +19 -47
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
- wandb/sdk/artifacts/storage_policy.py +4 -1
- wandb/sdk/data_types/base_types/wb_value.py +1 -1
- wandb/sdk/data_types/image.py +2 -2
- wandb/sdk/interface/interface.py +49 -13
- wandb/sdk/interface/interface_shared.py +17 -11
- wandb/sdk/internal/file_stream.py +20 -1
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +3 -1
- wandb/sdk/internal/job_builder.py +49 -19
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/sender.py +96 -124
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +5 -3
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/_launch.py +3 -3
- wandb/sdk/launch/_launch_add.py +28 -29
- wandb/sdk/launch/_project_spec.py +148 -136
- wandb/sdk/launch/agent/agent.py +3 -7
- wandb/sdk/launch/agent/config.py +0 -27
- wandb/sdk/launch/builder/build.py +54 -28
- wandb/sdk/launch/builder/docker_builder.py +4 -15
- wandb/sdk/launch/builder/kaniko_builder.py +72 -45
- wandb/sdk/launch/create_job.py +6 -40
- wandb/sdk/launch/loader.py +10 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/local_registry.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/runner/local_container.py +15 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +11 -3
- wandb/sdk/launch/utils.py +14 -0
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +4 -1
- wandb/sdk/lib/apikey.py +0 -5
- wandb/sdk/lib/config_util.py +0 -31
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +72 -0
- wandb/sdk/service/service.py +7 -2
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +12 -1
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +164 -110
- wandb/sdk/wandb_settings.py +58 -16
- wandb/testing/relay.py +5 -6
- wandb/util.py +50 -7
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
- wandb/apis/importers/base.py +0 -400
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
wandb/sdk/internal/sender.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
"""sender."""
|
2
2
|
|
3
|
-
import concurrent.futures
|
4
3
|
import json
|
5
4
|
import logging
|
6
5
|
import os
|
@@ -18,12 +17,10 @@ from typing import (
|
|
18
17
|
Dict,
|
19
18
|
Generator,
|
20
19
|
List,
|
21
|
-
NewType,
|
22
20
|
Optional,
|
23
21
|
Tuple,
|
24
22
|
Type,
|
25
23
|
Union,
|
26
|
-
cast,
|
27
24
|
)
|
28
25
|
|
29
26
|
import requests
|
@@ -43,6 +40,7 @@ from wandb.sdk.internal import (
|
|
43
40
|
file_stream,
|
44
41
|
internal_api,
|
45
42
|
job_builder,
|
43
|
+
sender_config,
|
46
44
|
update,
|
47
45
|
)
|
48
46
|
from wandb.sdk.internal.file_pusher import FilePusher
|
@@ -85,9 +83,6 @@ if TYPE_CHECKING:
|
|
85
83
|
logger = logging.getLogger(__name__)
|
86
84
|
|
87
85
|
|
88
|
-
DictWithValues = NewType("DictWithValues", Dict[str, Any])
|
89
|
-
DictNoValues = NewType("DictNoValues", Dict[str, Any])
|
90
|
-
|
91
86
|
_OUTPUT_MIN_CALLBACK_INTERVAL = 2 # seconds
|
92
87
|
|
93
88
|
|
@@ -120,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
|
|
120
115
|
"ref": content.ref if content.ref else None,
|
121
116
|
"size": content.size if content.size is not None else None,
|
122
117
|
"local_path": content.local_path if content.local_path else None,
|
118
|
+
"skip_cache": content.skip_cache,
|
123
119
|
"extra": {
|
124
120
|
extra.key: json.loads(extra.value_json) for extra in content.extra
|
125
121
|
},
|
@@ -263,8 +259,9 @@ class SendManager:
|
|
263
259
|
self._project = None
|
264
260
|
|
265
261
|
# keep track of config from key/val updates
|
266
|
-
self._consolidated_config
|
267
|
-
|
262
|
+
self._consolidated_config = sender_config.ConfigState()
|
263
|
+
|
264
|
+
self._start_time: int = 0
|
268
265
|
self._telemetry_obj = telemetry.TelemetryRecord()
|
269
266
|
self._config_metric_pbdict_list: List[Dict[int, Any]] = []
|
270
267
|
self._metadata_summary: Dict[str, Any] = defaultdict()
|
@@ -540,7 +537,7 @@ class SendManager:
|
|
540
537
|
self._maybe_update_config(always=final)
|
541
538
|
|
542
539
|
def _debounce_config(self) -> None:
|
543
|
-
config_value_dict = self.
|
540
|
+
config_value_dict = self._config_backend_dict()
|
544
541
|
# TODO(jhr): check result of upsert_run?
|
545
542
|
if self._run:
|
546
543
|
self._api.upsert_run(
|
@@ -737,18 +734,7 @@ class SendManager:
|
|
737
734
|
)
|
738
735
|
self._respond_result(result)
|
739
736
|
|
740
|
-
def
|
741
|
-
"""Respond to a request for a job link."""
|
742
|
-
result = proto_util._result_from_record(record)
|
743
|
-
result.response.job_info_response.sequenceId = (
|
744
|
-
self._job_builder._job_seq_id or ""
|
745
|
-
)
|
746
|
-
result.response.job_info_response.version = (
|
747
|
-
self._job_builder._job_version_alias or ""
|
748
|
-
)
|
749
|
-
self._respond_result(result)
|
750
|
-
|
751
|
-
def _maybe_setup_resume(
|
737
|
+
def _setup_resume(
|
752
738
|
self, run: "RunRecord"
|
753
739
|
) -> Optional["wandb_internal_pb2.ErrorInfo"]:
|
754
740
|
"""Queries the backend for a run; fail if the settings are incompatible."""
|
@@ -856,51 +842,20 @@ class SendManager:
|
|
856
842
|
)
|
857
843
|
return framework
|
858
844
|
|
859
|
-
def
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
config_dict[wandb_key]["framework"] = s
|
874
|
-
s = self._telemetry_obj.huggingface_version
|
875
|
-
if s:
|
876
|
-
config_dict[wandb_key]["huggingface_version"] = s
|
877
|
-
b = self._telemetry_obj.env.jupyter
|
878
|
-
config_dict[wandb_key]["is_jupyter_run"] = b
|
879
|
-
b = self._telemetry_obj.env.kaggle
|
880
|
-
config_dict[wandb_key]["is_kaggle_kernel"] = b
|
881
|
-
|
882
|
-
config_dict[wandb_key]["start_time"] = self._start_time
|
883
|
-
|
884
|
-
t: Dict[int, Any] = proto_util.proto_encode_to_dict(self._telemetry_obj)
|
885
|
-
config_dict[wandb_key]["t"] = t
|
886
|
-
|
887
|
-
def _config_metric_update(self, config_dict: Dict[str, Any]) -> None:
|
888
|
-
"""Add default xaxis to config."""
|
889
|
-
if not self._config_metric_pbdict_list:
|
890
|
-
return
|
891
|
-
wandb_key = "_wandb"
|
892
|
-
config_dict.setdefault(wandb_key, dict())
|
893
|
-
config_dict[wandb_key]["m"] = self._config_metric_pbdict_list
|
894
|
-
|
895
|
-
def _config_format(self, config_data: Optional[DictNoValues]) -> DictWithValues:
|
896
|
-
"""Format dict into value dict with telemetry info."""
|
897
|
-
config_dict: Dict[str, Any] = config_data.copy() if config_data else dict()
|
898
|
-
self._config_telemetry_update(config_dict)
|
899
|
-
self._config_metric_update(config_dict)
|
900
|
-
config_value_dict: DictWithValues = config_util.dict_add_value_dict(config_dict)
|
901
|
-
return config_value_dict
|
902
|
-
|
903
|
-
def _config_save(self, config_value_dict: DictWithValues) -> None:
|
845
|
+
def _config_backend_dict(self) -> sender_config.BackendConfigDict:
|
846
|
+
config = self._consolidated_config or sender_config.ConfigState()
|
847
|
+
|
848
|
+
return config.to_backend_dict(
|
849
|
+
telemetry_record=self._telemetry_obj,
|
850
|
+
framework=self._telemetry_get_framework(),
|
851
|
+
start_time_millis=self._start_time,
|
852
|
+
metric_pbdicts=self._config_metric_pbdict_list,
|
853
|
+
)
|
854
|
+
|
855
|
+
def _config_save(
|
856
|
+
self,
|
857
|
+
config_value_dict: sender_config.BackendConfigDict,
|
858
|
+
) -> None:
|
904
859
|
config_path = os.path.join(self._settings.files_dir, "config.yaml")
|
905
860
|
config_util.save_config_file_from_dict(config_path, config_value_dict)
|
906
861
|
|
@@ -925,13 +880,37 @@ class SendManager:
|
|
925
880
|
pass
|
926
881
|
# TODO: do something if sync spell is not successful?
|
927
882
|
|
883
|
+
def _setup_fork(self, server_run: dict):
|
884
|
+
assert self._settings.fork_from
|
885
|
+
assert self._settings.fork_from.metric == "_step"
|
886
|
+
assert self._run
|
887
|
+
first_step = int(self._settings.fork_from.value) + 1
|
888
|
+
self._resume_state.step = first_step
|
889
|
+
self._resume_state.history = server_run.get("historyLineCount", 0)
|
890
|
+
self._run.forked = True
|
891
|
+
self._run.starting_step = first_step
|
892
|
+
|
893
|
+
def _handle_error(
|
894
|
+
self,
|
895
|
+
record: "Record",
|
896
|
+
error: "wandb_internal_pb2.ErrorInfo",
|
897
|
+
run: "RunRecord",
|
898
|
+
) -> None:
|
899
|
+
if record.control.req_resp or record.control.mailbox_slot:
|
900
|
+
result = proto_util._result_from_record(record)
|
901
|
+
result.run_result.run.CopyFrom(run)
|
902
|
+
result.run_result.error.CopyFrom(error)
|
903
|
+
self._respond_result(result)
|
904
|
+
else:
|
905
|
+
logger.error("Got error in async mode: %s", error.message)
|
906
|
+
|
928
907
|
def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
|
929
908
|
run = record.run
|
930
909
|
error = None
|
931
910
|
is_wandb_init = self._run is None
|
932
911
|
|
933
912
|
# save start time of a run
|
934
|
-
self._start_time = run.start_time.ToMicroseconds()
|
913
|
+
self._start_time = run.start_time.ToMicroseconds() // 1e6
|
935
914
|
|
936
915
|
# update telemetry
|
937
916
|
if run.telemetry:
|
@@ -940,61 +919,69 @@ class SendManager:
|
|
940
919
|
self._telemetry_obj.feature.sync = True
|
941
920
|
|
942
921
|
# build config dict
|
943
|
-
config_value_dict: Optional[
|
922
|
+
config_value_dict: Optional[sender_config.BackendConfigDict] = None
|
944
923
|
if run.config:
|
945
|
-
|
946
|
-
config_value_dict = self.
|
924
|
+
self._consolidated_config.update_from_proto(run.config)
|
925
|
+
config_value_dict = self._config_backend_dict()
|
947
926
|
self._config_save(config_value_dict)
|
948
927
|
|
928
|
+
do_fork = self._settings.fork_from is not None and is_wandb_init
|
929
|
+
do_resume = bool(self._settings.resume)
|
930
|
+
|
931
|
+
if do_fork and do_resume:
|
932
|
+
error = wandb_internal_pb2.ErrorInfo()
|
933
|
+
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
934
|
+
error.message = (
|
935
|
+
"You cannot use `resume` and `fork_from` together. Please choose one."
|
936
|
+
)
|
937
|
+
self._handle_error(record, error, run)
|
938
|
+
|
949
939
|
if is_wandb_init:
|
950
940
|
# Ensure we have a project to query for status
|
951
941
|
if run.project == "":
|
952
942
|
run.project = util.auto_project_name(self._settings.program)
|
953
943
|
# Only check resume status on `wandb.init`
|
954
|
-
|
944
|
+
|
945
|
+
if do_resume:
|
946
|
+
error = self._setup_resume(run)
|
955
947
|
|
956
948
|
if error is not None:
|
957
|
-
|
958
|
-
result = proto_util._result_from_record(record)
|
959
|
-
result.run_result.run.CopyFrom(run)
|
960
|
-
result.run_result.error.CopyFrom(error)
|
961
|
-
self._respond_result(result)
|
962
|
-
else:
|
963
|
-
logger.error("Got error in async mode: %s", error.message)
|
949
|
+
self._handle_error(record, error, run)
|
964
950
|
return
|
965
951
|
|
966
952
|
# Save the resumed config
|
967
953
|
if self._resume_state.config is not None:
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
self._consolidated_config.update(config_dict)
|
974
|
-
config_value_dict = self._config_format(self._consolidated_config)
|
954
|
+
self._consolidated_config.merge_resumed_config(
|
955
|
+
config_util.dict_strip_value_dict(self._resume_state.config)
|
956
|
+
)
|
957
|
+
|
958
|
+
config_value_dict = self._config_backend_dict()
|
975
959
|
self._config_save(config_value_dict)
|
976
960
|
|
977
961
|
# handle empty config
|
978
962
|
# TODO(jhr): consolidate the 4 ways config is built:
|
979
963
|
# (passed config, empty config, resume config, send_config)
|
980
964
|
if not config_value_dict:
|
981
|
-
config_value_dict = self.
|
965
|
+
config_value_dict = self._config_backend_dict()
|
982
966
|
self._config_save(config_value_dict)
|
983
967
|
|
984
968
|
try:
|
985
|
-
self._init_run(run, config_value_dict)
|
969
|
+
server_run = self._init_run(run, config_value_dict)
|
986
970
|
except (CommError, UsageError) as e:
|
987
971
|
logger.error(e, exc_info=True)
|
988
|
-
|
989
|
-
|
990
|
-
result.run_result.run.CopyFrom(run)
|
991
|
-
error = ProtobufErrorHandler.from_exception(e)
|
992
|
-
result.run_result.error.CopyFrom(error)
|
993
|
-
self._respond_result(result)
|
972
|
+
error = ProtobufErrorHandler.from_exception(e)
|
973
|
+
self._handle_error(record, error, run)
|
994
974
|
return
|
995
975
|
|
996
976
|
assert self._run # self._run is configured in _init_run()
|
997
977
|
|
978
|
+
if do_fork:
|
979
|
+
error = self._setup_fork(server_run)
|
980
|
+
|
981
|
+
if error is not None:
|
982
|
+
self._handle_error(record, error, run)
|
983
|
+
return
|
984
|
+
|
998
985
|
if record.control.req_resp or record.control.mailbox_slot:
|
999
986
|
result = proto_util._result_from_record(record)
|
1000
987
|
# TODO: we could do self._interface.publish_defer(resp) to notify
|
@@ -1012,8 +999,8 @@ class SendManager:
|
|
1012
999
|
def _init_run(
|
1013
1000
|
self,
|
1014
1001
|
run: "RunRecord",
|
1015
|
-
config_dict: Optional[
|
1016
|
-
) ->
|
1002
|
+
config_dict: Optional[sender_config.BackendConfigDict],
|
1003
|
+
) -> dict:
|
1017
1004
|
# We subtract the previous runs runtime when resuming
|
1018
1005
|
start_time = (
|
1019
1006
|
run.start_time.ToMicroseconds() / 1e6
|
@@ -1098,6 +1085,7 @@ class SendManager:
|
|
1098
1085
|
self._run.sweep_id = sweep_id
|
1099
1086
|
if os.getenv("SPELL_RUN_URL"):
|
1100
1087
|
self._sync_spell()
|
1088
|
+
return server_run
|
1101
1089
|
|
1102
1090
|
def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
|
1103
1091
|
assert self._run # self._run is configured by caller
|
@@ -1344,8 +1332,7 @@ class SendManager:
|
|
1344
1332
|
self._config_needs_debounce = True
|
1345
1333
|
|
1346
1334
|
def send_config(self, record: "Record") -> None:
|
1347
|
-
|
1348
|
-
config_util.update_from_proto(self._consolidated_config, cfg)
|
1335
|
+
self._consolidated_config.update_from_proto(record.config)
|
1349
1336
|
self._update_config()
|
1350
1337
|
|
1351
1338
|
def send_metric(self, record: "Record") -> None:
|
@@ -1457,40 +1444,27 @@ class SendManager:
|
|
1457
1444
|
)
|
1458
1445
|
|
1459
1446
|
def send_request_log_artifact(self, record: "Record") -> None:
|
1460
|
-
assert record.control.
|
1447
|
+
assert record.control.req_resp
|
1461
1448
|
result = proto_util._result_from_record(record)
|
1462
1449
|
artifact = record.request.log_artifact.artifact
|
1463
1450
|
history_step = record.request.log_artifact.history_step
|
1464
1451
|
|
1465
|
-
future = None
|
1466
1452
|
try:
|
1467
|
-
res
|
1453
|
+
res = self._send_artifact(artifact, history_step)
|
1468
1454
|
assert res, "Unable to send artifact"
|
1469
|
-
result.response.log_artifact_response.artifact_id = res
|
1455
|
+
result.response.log_artifact_response.artifact_id = res["id"]
|
1470
1456
|
logger.info(f"logged artifact {artifact.name} - {res}")
|
1471
1457
|
except Exception as e:
|
1472
1458
|
result.response.log_artifact_response.error_message = (
|
1473
1459
|
f'error logging artifact "{artifact.type}/{artifact.name}": {e}'
|
1474
1460
|
)
|
1475
1461
|
|
1476
|
-
|
1477
|
-
if fut.exception() is not None:
|
1478
|
-
result.response.log_artifact_response.error_message = f'error logging artifact "{artifact.type}/{artifact.name}": {fut.exception()}'
|
1479
|
-
self._respond_result(result)
|
1480
|
-
|
1481
|
-
if future is not None:
|
1482
|
-
# respond to the request only after the artifact is fully committed
|
1483
|
-
future.add_done_callback(_respond_result)
|
1484
|
-
else:
|
1485
|
-
self._respond_result(result)
|
1462
|
+
self._respond_result(result)
|
1486
1463
|
|
1487
1464
|
def send_artifact(self, record: "Record") -> None:
|
1488
1465
|
artifact = record.artifact
|
1489
1466
|
try:
|
1490
|
-
res
|
1491
|
-
# wait for future to complete in send artifact
|
1492
|
-
if future is not None:
|
1493
|
-
future.result()
|
1467
|
+
res = self._send_artifact(artifact)
|
1494
1468
|
logger.info(f"sent artifact {artifact.name} - {res}")
|
1495
1469
|
except Exception as e:
|
1496
1470
|
logger.error(
|
@@ -1501,8 +1475,8 @@ class SendManager:
|
|
1501
1475
|
|
1502
1476
|
def _send_artifact(
|
1503
1477
|
self, artifact: "ArtifactRecord", history_step: Optional[int] = None
|
1504
|
-
) ->
|
1505
|
-
from
|
1478
|
+
) -> Optional[Dict]:
|
1479
|
+
from wandb.util import parse_version
|
1506
1480
|
|
1507
1481
|
assert self._pusher
|
1508
1482
|
saver = ArtifactSaver(
|
@@ -1522,10 +1496,10 @@ class SendManager:
|
|
1522
1496
|
"This W&B Server doesn't support distributed artifacts, "
|
1523
1497
|
"have your administrator install wandb/local >= 0.9.37"
|
1524
1498
|
)
|
1525
|
-
return
|
1499
|
+
return None
|
1526
1500
|
|
1527
1501
|
metadata = json.loads(artifact.metadata) if artifact.metadata else None
|
1528
|
-
res
|
1502
|
+
res = saver.save(
|
1529
1503
|
type=artifact.type,
|
1530
1504
|
name=artifact.name,
|
1531
1505
|
client_id=artifact.client_id,
|
@@ -1543,10 +1517,10 @@ class SendManager:
|
|
1543
1517
|
)
|
1544
1518
|
|
1545
1519
|
self._job_builder._handle_server_artifact(res, artifact)
|
1546
|
-
return res
|
1520
|
+
return res
|
1547
1521
|
|
1548
1522
|
def send_alert(self, record: "Record") -> None:
|
1549
|
-
from
|
1523
|
+
from wandb.util import parse_version
|
1550
1524
|
|
1551
1525
|
alert = record.alert
|
1552
1526
|
max_cli_version = self._max_cli_version()
|
@@ -1639,9 +1613,7 @@ class SendManager:
|
|
1639
1613
|
def _flush_job(self) -> None:
|
1640
1614
|
if self._job_builder.disable or self._settings._offline:
|
1641
1615
|
return
|
1642
|
-
self._job_builder.set_config(
|
1643
|
-
{k: v for k, v in self._consolidated_config.items() if k != "_wandb"}
|
1644
|
-
)
|
1616
|
+
self._job_builder.set_config(self._consolidated_config.non_internal_config())
|
1645
1617
|
summary_dict = self._cached_summary.copy()
|
1646
1618
|
summary_dict.pop("_wandb", None)
|
1647
1619
|
self._job_builder.set_summary(summary_dict)
|
@@ -0,0 +1,197 @@
|
|
1
|
+
import json
|
2
|
+
from typing import Any, Dict, NewType, Optional, Sequence
|
3
|
+
|
4
|
+
from wandb.proto import wandb_internal_pb2
|
5
|
+
from wandb.sdk.lib import proto_util, telemetry
|
6
|
+
|
7
|
+
BackendConfigDict = NewType("BackendConfigDict", Dict[str, Any])
|
8
|
+
"""Run config dictionary in the format used by the backend."""
|
9
|
+
|
10
|
+
_WANDB_INTERNAL_KEY = "_wandb"
|
11
|
+
|
12
|
+
|
13
|
+
class ConfigState:
|
14
|
+
"""The configuration of a run."""
|
15
|
+
|
16
|
+
def __init__(self, tree: Optional[Dict[str, Any]] = None) -> None:
|
17
|
+
self._tree: Dict[str, Any] = tree or {}
|
18
|
+
"""A tree with string-valued nodes and JSON leaves.
|
19
|
+
|
20
|
+
Leaves are Python objects that are valid JSON values:
|
21
|
+
|
22
|
+
* Primitives like strings and numbers
|
23
|
+
* Dictionaries from strings to JSON objects
|
24
|
+
* Lists of JSON objects
|
25
|
+
"""
|
26
|
+
|
27
|
+
def non_internal_config(self) -> Dict[str, Any]:
|
28
|
+
"""Returns the config settings minus "_wandb"."""
|
29
|
+
return {k: v for k, v in self._tree.items() if k != _WANDB_INTERNAL_KEY}
|
30
|
+
|
31
|
+
def update_from_proto(
|
32
|
+
self,
|
33
|
+
config_record: wandb_internal_pb2.ConfigRecord,
|
34
|
+
) -> None:
|
35
|
+
"""Applies update and remove commands."""
|
36
|
+
for config_item in config_record.update:
|
37
|
+
self._update_at_path(
|
38
|
+
_key_path(config_item),
|
39
|
+
json.loads(config_item.value_json),
|
40
|
+
)
|
41
|
+
|
42
|
+
for config_item in config_record.remove:
|
43
|
+
self._delete_at_path(_key_path(config_item))
|
44
|
+
|
45
|
+
def merge_resumed_config(self, old_config_tree: Dict[str, Any]) -> None:
|
46
|
+
"""Merges the config from a run that's being resumed."""
|
47
|
+
# Add any top-level keys that aren't already set.
|
48
|
+
self._add_unset_keys_from_subtree(old_config_tree, [])
|
49
|
+
|
50
|
+
# Unfortunately, when a user logs visualizations, we store them in the
|
51
|
+
# run's config. When resuming a run, we want to avoid erasing previously
|
52
|
+
# logged visualizations, hence this special handling:
|
53
|
+
self._add_unset_keys_from_subtree(
|
54
|
+
old_config_tree,
|
55
|
+
[_WANDB_INTERNAL_KEY, "visualize"],
|
56
|
+
)
|
57
|
+
self._add_unset_keys_from_subtree(
|
58
|
+
old_config_tree,
|
59
|
+
[_WANDB_INTERNAL_KEY, "viz"],
|
60
|
+
)
|
61
|
+
|
62
|
+
def _add_unset_keys_from_subtree(
|
63
|
+
self,
|
64
|
+
old_config_tree: Dict[str, Any],
|
65
|
+
path: Sequence[str],
|
66
|
+
) -> None:
|
67
|
+
"""Uses the given subtree for keys that aren't already set."""
|
68
|
+
old_subtree = _subtree(old_config_tree, path, create=False)
|
69
|
+
if not old_subtree:
|
70
|
+
return
|
71
|
+
|
72
|
+
new_subtree = _subtree(self._tree, path, create=True)
|
73
|
+
assert new_subtree is not None
|
74
|
+
|
75
|
+
for key, value in old_subtree.items():
|
76
|
+
if key not in new_subtree:
|
77
|
+
new_subtree[key] = value
|
78
|
+
|
79
|
+
def to_backend_dict(
|
80
|
+
self,
|
81
|
+
telemetry_record: telemetry.TelemetryRecord,
|
82
|
+
framework: Optional[str],
|
83
|
+
start_time_millis: int,
|
84
|
+
metric_pbdicts: Sequence[Dict[int, Any]],
|
85
|
+
) -> BackendConfigDict:
|
86
|
+
"""Returns a dictionary representation expected by the backend.
|
87
|
+
|
88
|
+
The backend expects the configuration in a specific format, and the
|
89
|
+
config is also used to store additional metadata about the run.
|
90
|
+
|
91
|
+
Args:
|
92
|
+
telemetry_record: Telemetry information to insert.
|
93
|
+
framework: The detected framework used in the run (e.g. TensorFlow).
|
94
|
+
start_time_millis: The run's start time in Unix milliseconds.
|
95
|
+
metric_pbdicts: List of dict representations of metric protobuffers.
|
96
|
+
"""
|
97
|
+
backend_dict = self._tree.copy()
|
98
|
+
wandb_internal = backend_dict.setdefault(_WANDB_INTERNAL_KEY, {})
|
99
|
+
|
100
|
+
###################################################
|
101
|
+
# Telemetry information
|
102
|
+
###################################################
|
103
|
+
py_version = telemetry_record.python_version
|
104
|
+
if py_version:
|
105
|
+
wandb_internal["python_version"] = py_version
|
106
|
+
|
107
|
+
cli_version = telemetry_record.cli_version
|
108
|
+
if cli_version:
|
109
|
+
wandb_internal["cli_version"] = cli_version
|
110
|
+
|
111
|
+
if framework:
|
112
|
+
wandb_internal["framework"] = framework
|
113
|
+
|
114
|
+
huggingface_version = telemetry_record.huggingface_version
|
115
|
+
if huggingface_version:
|
116
|
+
wandb_internal["huggingface_version"] = huggingface_version
|
117
|
+
|
118
|
+
wandb_internal["is_jupyter_run"] = telemetry_record.env.jupyter
|
119
|
+
wandb_internal["is_kaggle_kernel"] = telemetry_record.env.kaggle
|
120
|
+
wandb_internal["start_time"] = start_time_millis
|
121
|
+
|
122
|
+
# The full telemetry record.
|
123
|
+
wandb_internal["t"] = proto_util.proto_encode_to_dict(telemetry_record)
|
124
|
+
|
125
|
+
###################################################
|
126
|
+
# Metrics
|
127
|
+
###################################################
|
128
|
+
if metric_pbdicts:
|
129
|
+
wandb_internal["m"] = metric_pbdicts
|
130
|
+
|
131
|
+
return BackendConfigDict(
|
132
|
+
{
|
133
|
+
key: {
|
134
|
+
# Configurations can be stored in a hand-written YAML file,
|
135
|
+
# and users can add descriptions to their hyperparameters
|
136
|
+
# there. However, we don't support a way to set descriptions
|
137
|
+
# via code, so this is always None.
|
138
|
+
"desc": None,
|
139
|
+
"value": value,
|
140
|
+
}
|
141
|
+
for key, value in self._tree.items()
|
142
|
+
}
|
143
|
+
)
|
144
|
+
|
145
|
+
def _update_at_path(
|
146
|
+
self,
|
147
|
+
key_path: Sequence[str],
|
148
|
+
value: Any,
|
149
|
+
) -> None:
|
150
|
+
"""Sets the value at the path in the config tree."""
|
151
|
+
subtree = _subtree(self._tree, key_path[:-1], create=True)
|
152
|
+
assert subtree is not None
|
153
|
+
|
154
|
+
subtree[key_path[-1]] = value
|
155
|
+
|
156
|
+
def _delete_at_path(
|
157
|
+
self,
|
158
|
+
key_path: Sequence[str],
|
159
|
+
) -> None:
|
160
|
+
"""Removes the subtree at the path in the config tree."""
|
161
|
+
subtree = _subtree(self._tree, key_path[:-1], create=False)
|
162
|
+
if subtree:
|
163
|
+
del subtree[key_path[-1]]
|
164
|
+
|
165
|
+
|
166
|
+
def _key_path(config_item: wandb_internal_pb2.ConfigItem) -> Sequence[str]:
|
167
|
+
"""Returns the key path referenced by the config item."""
|
168
|
+
if config_item.nested_key:
|
169
|
+
return config_item.nested_key
|
170
|
+
elif config_item.key:
|
171
|
+
return [config_item.key]
|
172
|
+
else:
|
173
|
+
raise AssertionError(
|
174
|
+
"Invalid ConfigItem: either key or nested_key must be set",
|
175
|
+
)
|
176
|
+
|
177
|
+
|
178
|
+
def _subtree(
|
179
|
+
tree: Dict[str, Any],
|
180
|
+
key_path: Sequence[str],
|
181
|
+
*,
|
182
|
+
create: bool = False,
|
183
|
+
) -> Optional[Dict[str, Any]]:
|
184
|
+
"""Returns a subtree at the given path."""
|
185
|
+
for key in key_path:
|
186
|
+
subtree = tree.get(key)
|
187
|
+
|
188
|
+
if not subtree:
|
189
|
+
if create:
|
190
|
+
subtree = {}
|
191
|
+
tree[key] = subtree
|
192
|
+
else:
|
193
|
+
return None
|
194
|
+
|
195
|
+
tree = subtree
|
196
|
+
|
197
|
+
return tree
|
@@ -2,6 +2,7 @@ from dataclasses import fields
|
|
2
2
|
from typing import Any, Iterable, Sequence, Tuple
|
3
3
|
|
4
4
|
from wandb.proto import wandb_settings_pb2
|
5
|
+
from wandb.sdk.lib import RunMoment
|
5
6
|
from wandb.sdk.wandb_settings import SettingsData
|
6
7
|
|
7
8
|
|
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
|
|
38
39
|
unpacked_inner[inner_key] = inner_value
|
39
40
|
unpacked_mapping[outer_key] = unpacked_inner
|
40
41
|
value = unpacked_mapping
|
42
|
+
elif key == "fork_from":
|
43
|
+
value = getattr(proto, key)
|
44
|
+
if value.run:
|
45
|
+
value = RunMoment(
|
46
|
+
run=value.run, value=value.value, metric=value.metric
|
47
|
+
)
|
48
|
+
else:
|
49
|
+
value = None
|
41
50
|
else:
|
42
51
|
if proto.HasField(key): # type: ignore [arg-type]
|
43
52
|
value = getattr(proto, key).value
|
@@ -14,7 +14,6 @@ from wandb.sdk.internal.settings_static import SettingsStatic
|
|
14
14
|
from wandb.sdk.lib import filesystem
|
15
15
|
from wandb.sdk.lib.filenames import CONDA_ENVIRONMENTS_FNAME, DIFF_FNAME, METADATA_FNAME
|
16
16
|
from wandb.sdk.lib.gitlib import GitRepo
|
17
|
-
from wandb.sdk.wandb_settings import _get_program_relpath
|
18
17
|
|
19
18
|
from .assets.interfaces import Interface
|
20
19
|
|
@@ -168,7 +167,7 @@ class SystemInfo:
|
|
168
167
|
data["program"] = self.settings.program
|
169
168
|
# Used during artifact-job creation, always points to the relpath
|
170
169
|
# of code execution, even when in a git repo
|
171
|
-
data["codePathLocal"] =
|
170
|
+
data["codePathLocal"] = self.settings._code_path_local
|
172
171
|
if not self.settings.disable_code:
|
173
172
|
if self.settings.program_relpath:
|
174
173
|
data["codePath"] = self.settings.program_relpath
|
@@ -213,7 +212,10 @@ class SystemInfo:
|
|
213
212
|
os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
|
214
213
|
) as f:
|
215
214
|
subprocess.call(
|
216
|
-
["conda", "env", "export"],
|
215
|
+
["conda", "env", "export"],
|
216
|
+
stdout=f,
|
217
|
+
stderr=subprocess.DEVNULL,
|
218
|
+
timeout=15, # add timeout since conda env export could take a really long time
|
217
219
|
)
|
218
220
|
except Exception as e:
|
219
221
|
logger.exception(f"Error saving conda packages: {e}")
|
wandb/sdk/internal/update.py
CHANGED
@@ -8,7 +8,7 @@ import wandb
|
|
8
8
|
def _find_available(
|
9
9
|
current_version: str,
|
10
10
|
) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
|
11
|
-
from
|
11
|
+
from wandb.util import parse_version
|
12
12
|
|
13
13
|
pypi_url = f"https://pypi.org/pypi/{wandb._wandb_module}/json"
|
14
14
|
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -10,7 +10,7 @@ import wandb
|
|
10
10
|
from wandb.apis.internal import Api
|
11
11
|
|
12
12
|
from . import loader
|
13
|
-
from ._project_spec import
|
13
|
+
from ._project_spec import LaunchProject
|
14
14
|
from .agent import LaunchAgent
|
15
15
|
from .builder.build import construct_agent_configs
|
16
16
|
from .environment.local_environment import LocalEnvironment
|
@@ -221,8 +221,8 @@ async def _launch(
|
|
221
221
|
author=None,
|
222
222
|
)
|
223
223
|
validate_launch_spec_source(launch_spec)
|
224
|
-
launch_project =
|
225
|
-
launch_project
|
224
|
+
launch_project = LaunchProject.from_spec(launch_spec, api)
|
225
|
+
launch_project.fetch_and_validate_project()
|
226
226
|
entrypoint = launch_project.get_single_entry_point()
|
227
227
|
image_uri = launch_project.docker_image # Either set by user or None.
|
228
228
|
|