wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/importers/__init__.py +1 -4
- wandb/apis/importers/internals/internal.py +386 -0
- wandb/apis/importers/internals/protocols.py +125 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +125 -88
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1604 -0
- wandb/apis/public/api.py +7 -10
- wandb/apis/public/artifacts.py +38 -0
- wandb/apis/public/files.py +11 -2
- wandb/apis/reports/v2/__init__.py +0 -19
- wandb/apis/reports/v2/expr_parsing.py +0 -1
- wandb/apis/reports/v2/interface.py +15 -18
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +52 -55
- wandb/integration/gym/__init__.py +2 -1
- wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
- wandb/integration/keras/keras.py +6 -4
- wandb/integration/kfp/kfp_patch.py +2 -2
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +75 -31
- wandb/sdk/artifacts/artifact_manifest.py +5 -2
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
- wandb/sdk/artifacts/artifact_saver.py +19 -47
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
- wandb/sdk/artifacts/storage_policy.py +4 -1
- wandb/sdk/data_types/base_types/wb_value.py +1 -1
- wandb/sdk/data_types/image.py +2 -2
- wandb/sdk/interface/interface.py +49 -13
- wandb/sdk/interface/interface_shared.py +17 -11
- wandb/sdk/internal/file_stream.py +20 -1
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +3 -1
- wandb/sdk/internal/job_builder.py +49 -19
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/sender.py +96 -124
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +5 -3
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/_launch.py +3 -3
- wandb/sdk/launch/_launch_add.py +28 -29
- wandb/sdk/launch/_project_spec.py +148 -136
- wandb/sdk/launch/agent/agent.py +3 -7
- wandb/sdk/launch/agent/config.py +0 -27
- wandb/sdk/launch/builder/build.py +54 -28
- wandb/sdk/launch/builder/docker_builder.py +4 -15
- wandb/sdk/launch/builder/kaniko_builder.py +72 -45
- wandb/sdk/launch/create_job.py +6 -40
- wandb/sdk/launch/loader.py +10 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/local_registry.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/runner/local_container.py +15 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +11 -3
- wandb/sdk/launch/utils.py +14 -0
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +4 -1
- wandb/sdk/lib/apikey.py +0 -5
- wandb/sdk/lib/config_util.py +0 -31
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +72 -0
- wandb/sdk/service/service.py +7 -2
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +12 -1
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +164 -110
- wandb/sdk/wandb_settings.py +58 -16
- wandb/testing/relay.py +5 -6
- wandb/util.py +50 -7
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
- wandb/apis/importers/base.py +0 -400
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
wandb/sdk/internal/sender.py
CHANGED
@@ -1,6 +1,5 @@
 """sender."""
 
-import concurrent.futures
 import json
 import logging
 import os
@@ -18,12 +17,10 @@ from typing import (
     Dict,
     Generator,
     List,
-    NewType,
     Optional,
     Tuple,
     Type,
     Union,
-    cast,
 )
 
 import requests
@@ -43,6 +40,7 @@ from wandb.sdk.internal import (
     file_stream,
     internal_api,
     job_builder,
+    sender_config,
     update,
 )
 from wandb.sdk.internal.file_pusher import FilePusher
@@ -85,9 +83,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
-DictWithValues = NewType("DictWithValues", Dict[str, Any])
-DictNoValues = NewType("DictNoValues", Dict[str, Any])
-
 _OUTPUT_MIN_CALLBACK_INTERVAL = 2  # seconds
 
 
@@ -120,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
             "ref": content.ref if content.ref else None,
             "size": content.size if content.size is not None else None,
             "local_path": content.local_path if content.local_path else None,
+            "skip_cache": content.skip_cache,
             "extra": {
                 extra.key: json.loads(extra.value_json) for extra in content.extra
             },
@@ -263,8 +259,9 @@ class SendManager:
         self._project = None
 
         # keep track of config from key/val updates
-        self._consolidated_config
-
+        self._consolidated_config = sender_config.ConfigState()
+
+        self._start_time: int = 0
         self._telemetry_obj = telemetry.TelemetryRecord()
         self._config_metric_pbdict_list: List[Dict[int, Any]] = []
         self._metadata_summary: Dict[str, Any] = defaultdict()
@@ -540,7 +537,7 @@ class SendManager:
         self._maybe_update_config(always=final)
 
     def _debounce_config(self) -> None:
-        config_value_dict = self.
+        config_value_dict = self._config_backend_dict()
         # TODO(jhr): check result of upsert_run?
         if self._run:
             self._api.upsert_run(
@@ -737,18 +734,7 @@ class SendManager:
         )
         self._respond_result(result)
 
-    def
-        """Respond to a request for a job link."""
-        result = proto_util._result_from_record(record)
-        result.response.job_info_response.sequenceId = (
-            self._job_builder._job_seq_id or ""
-        )
-        result.response.job_info_response.version = (
-            self._job_builder._job_version_alias or ""
-        )
-        self._respond_result(result)
-
-    def _maybe_setup_resume(
+    def _setup_resume(
         self, run: "RunRecord"
     ) -> Optional["wandb_internal_pb2.ErrorInfo"]:
         """Queries the backend for a run; fail if the settings are incompatible."""
@@ -856,51 +842,20 @@ class SendManager:
         )
         return framework
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-            config_dict[wandb_key]["framework"] = s
-        s = self._telemetry_obj.huggingface_version
-        if s:
-            config_dict[wandb_key]["huggingface_version"] = s
-        b = self._telemetry_obj.env.jupyter
-        config_dict[wandb_key]["is_jupyter_run"] = b
-        b = self._telemetry_obj.env.kaggle
-        config_dict[wandb_key]["is_kaggle_kernel"] = b
-
-        config_dict[wandb_key]["start_time"] = self._start_time
-
-        t: Dict[int, Any] = proto_util.proto_encode_to_dict(self._telemetry_obj)
-        config_dict[wandb_key]["t"] = t
-
-    def _config_metric_update(self, config_dict: Dict[str, Any]) -> None:
-        """Add default xaxis to config."""
-        if not self._config_metric_pbdict_list:
-            return
-        wandb_key = "_wandb"
-        config_dict.setdefault(wandb_key, dict())
-        config_dict[wandb_key]["m"] = self._config_metric_pbdict_list
-
-    def _config_format(self, config_data: Optional[DictNoValues]) -> DictWithValues:
-        """Format dict into value dict with telemetry info."""
-        config_dict: Dict[str, Any] = config_data.copy() if config_data else dict()
-        self._config_telemetry_update(config_dict)
-        self._config_metric_update(config_dict)
-        config_value_dict: DictWithValues = config_util.dict_add_value_dict(config_dict)
-        return config_value_dict
-
-    def _config_save(self, config_value_dict: DictWithValues) -> None:
+    def _config_backend_dict(self) -> sender_config.BackendConfigDict:
+        config = self._consolidated_config or sender_config.ConfigState()
+
+        return config.to_backend_dict(
+            telemetry_record=self._telemetry_obj,
+            framework=self._telemetry_get_framework(),
+            start_time_millis=self._start_time,
+            metric_pbdicts=self._config_metric_pbdict_list,
+        )
+
+    def _config_save(
+        self,
+        config_value_dict: sender_config.BackendConfigDict,
+    ) -> None:
         config_path = os.path.join(self._settings.files_dir, "config.yaml")
         config_util.save_config_file_from_dict(config_path, config_value_dict)
 
@@ -925,13 +880,37 @@ class SendManager:
             pass
         # TODO: do something if sync spell is not successful?
 
+    def _setup_fork(self, server_run: dict):
+        assert self._settings.fork_from
+        assert self._settings.fork_from.metric == "_step"
+        assert self._run
+        first_step = int(self._settings.fork_from.value) + 1
+        self._resume_state.step = first_step
+        self._resume_state.history = server_run.get("historyLineCount", 0)
+        self._run.forked = True
+        self._run.starting_step = first_step
+
+    def _handle_error(
+        self,
+        record: "Record",
+        error: "wandb_internal_pb2.ErrorInfo",
+        run: "RunRecord",
+    ) -> None:
+        if record.control.req_resp or record.control.mailbox_slot:
+            result = proto_util._result_from_record(record)
+            result.run_result.run.CopyFrom(run)
+            result.run_result.error.CopyFrom(error)
+            self._respond_result(result)
+        else:
+            logger.error("Got error in async mode: %s", error.message)
+
     def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
         run = record.run
         error = None
         is_wandb_init = self._run is None
 
         # save start time of a run
-        self._start_time = run.start_time.ToMicroseconds()
+        self._start_time = run.start_time.ToMicroseconds() // 1e6
 
         # update telemetry
         if run.telemetry:
@@ -940,61 +919,69 @@ class SendManager:
             self._telemetry_obj.feature.sync = True
 
         # build config dict
-        config_value_dict: Optional[
+        config_value_dict: Optional[sender_config.BackendConfigDict] = None
         if run.config:
-
-            config_value_dict = self.
+            self._consolidated_config.update_from_proto(run.config)
+            config_value_dict = self._config_backend_dict()
             self._config_save(config_value_dict)
 
+        do_fork = self._settings.fork_from is not None and is_wandb_init
+        do_resume = bool(self._settings.resume)
+
+        if do_fork and do_resume:
+            error = wandb_internal_pb2.ErrorInfo()
+            error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
+            error.message = (
+                "You cannot use `resume` and `fork_from` together. Please choose one."
+            )
+            self._handle_error(record, error, run)
+
         if is_wandb_init:
             # Ensure we have a project to query for status
             if run.project == "":
                 run.project = util.auto_project_name(self._settings.program)
             # Only check resume status on `wandb.init`
-
+
+            if do_resume:
+                error = self._setup_resume(run)
 
             if error is not None:
-
-                    result = proto_util._result_from_record(record)
-                    result.run_result.run.CopyFrom(run)
-                    result.run_result.error.CopyFrom(error)
-                    self._respond_result(result)
-                else:
-                    logger.error("Got error in async mode: %s", error.message)
+                self._handle_error(record, error, run)
                 return
 
             # Save the resumed config
             if self._resume_state.config is not None:
-
-
-
-
-
-                self._consolidated_config.update(config_dict)
-                config_value_dict = self._config_format(self._consolidated_config)
+                self._consolidated_config.merge_resumed_config(
+                    config_util.dict_strip_value_dict(self._resume_state.config)
+                )
+
+                config_value_dict = self._config_backend_dict()
                 self._config_save(config_value_dict)
 
         # handle empty config
         # TODO(jhr): consolidate the 4 ways config is built:
         # (passed config, empty config, resume config, send_config)
         if not config_value_dict:
-            config_value_dict = self.
+            config_value_dict = self._config_backend_dict()
            self._config_save(config_value_dict)
 
         try:
-            self._init_run(run, config_value_dict)
+            server_run = self._init_run(run, config_value_dict)
         except (CommError, UsageError) as e:
             logger.error(e, exc_info=True)
-
-
-                result.run_result.run.CopyFrom(run)
-                error = ProtobufErrorHandler.from_exception(e)
-                result.run_result.error.CopyFrom(error)
-                self._respond_result(result)
+            error = ProtobufErrorHandler.from_exception(e)
+            self._handle_error(record, error, run)
             return
 
         assert self._run  # self._run is configured in _init_run()
 
+        if do_fork:
+            error = self._setup_fork(server_run)
+
+            if error is not None:
+                self._handle_error(record, error, run)
+                return
+
         if record.control.req_resp or record.control.mailbox_slot:
             result = proto_util._result_from_record(record)
             # TODO: we could do self._interface.publish_defer(resp) to notify
@@ -1012,8 +999,8 @@ class SendManager:
     def _init_run(
         self,
         run: "RunRecord",
-        config_dict: Optional[
-    ) ->
+        config_dict: Optional[sender_config.BackendConfigDict],
+    ) -> dict:
         # We subtract the previous runs runtime when resuming
         start_time = (
             run.start_time.ToMicroseconds() / 1e6
@@ -1098,6 +1085,7 @@ class SendManager:
             self._run.sweep_id = sweep_id
         if os.getenv("SPELL_RUN_URL"):
             self._sync_spell()
+        return server_run
 
     def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
         assert self._run  # self._run is configured by caller
@@ -1344,8 +1332,7 @@ class SendManager:
         self._config_needs_debounce = True
 
     def send_config(self, record: "Record") -> None:
-
-        config_util.update_from_proto(self._consolidated_config, cfg)
+        self._consolidated_config.update_from_proto(record.config)
         self._update_config()
 
     def send_metric(self, record: "Record") -> None:
@@ -1457,40 +1444,27 @@ class SendManager:
         )
 
     def send_request_log_artifact(self, record: "Record") -> None:
-        assert record.control.
+        assert record.control.req_resp
         result = proto_util._result_from_record(record)
         artifact = record.request.log_artifact.artifact
         history_step = record.request.log_artifact.history_step
 
-        future = None
         try:
-            res
+            res = self._send_artifact(artifact, history_step)
             assert res, "Unable to send artifact"
-            result.response.log_artifact_response.artifact_id = res
+            result.response.log_artifact_response.artifact_id = res["id"]
             logger.info(f"logged artifact {artifact.name} - {res}")
         except Exception as e:
             result.response.log_artifact_response.error_message = (
                 f'error logging artifact "{artifact.type}/{artifact.name}": {e}'
             )
 
-
-            if fut.exception() is not None:
-                result.response.log_artifact_response.error_message = f'error logging artifact "{artifact.type}/{artifact.name}": {fut.exception()}'
-            self._respond_result(result)
-
-        if future is not None:
-            # respond to the request only after the artifact is fully committed
-            future.add_done_callback(_respond_result)
-        else:
-            self._respond_result(result)
+        self._respond_result(result)
 
     def send_artifact(self, record: "Record") -> None:
         artifact = record.artifact
         try:
-            res
-            # wait for future to complete in send artifact
-            if future is not None:
-                future.result()
+            res = self._send_artifact(artifact)
             logger.info(f"sent artifact {artifact.name} - {res}")
         except Exception as e:
             logger.error(
@@ -1501,8 +1475,8 @@ class SendManager:
 
     def _send_artifact(
         self, artifact: "ArtifactRecord", history_step: Optional[int] = None
-    ) ->
-        from
+    ) -> Optional[Dict]:
+        from wandb.util import parse_version
 
         assert self._pusher
         saver = ArtifactSaver(
@@ -1522,10 +1496,10 @@ class SendManager:
                 "This W&B Server doesn't support distributed artifacts, "
                 "have your administrator install wandb/local >= 0.9.37"
             )
-            return
+            return None
 
         metadata = json.loads(artifact.metadata) if artifact.metadata else None
-        res
+        res = saver.save(
             type=artifact.type,
             name=artifact.name,
             client_id=artifact.client_id,
@@ -1543,10 +1517,10 @@ class SendManager:
         )
 
         self._job_builder._handle_server_artifact(res, artifact)
-        return res
+        return res
 
     def send_alert(self, record: "Record") -> None:
-        from
+        from wandb.util import parse_version
 
         alert = record.alert
         max_cli_version = self._max_cli_version()
@@ -1639,9 +1613,7 @@ class SendManager:
     def _flush_job(self) -> None:
        if self._job_builder.disable or self._settings._offline:
            return
-        self._job_builder.set_config(
-            {k: v for k, v in self._consolidated_config.items() if k != "_wandb"}
-        )
+        self._job_builder.set_config(self._consolidated_config.non_internal_config())
        summary_dict = self._cached_summary.copy()
        summary_dict.pop("_wandb", None)
        self._job_builder.set_summary(summary_dict)
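Taken together, the `send_run` changes route every failure through the new `_handle_error` helper and reject runs that request both `resume` and `fork_from`. Below is a minimal hedged sketch of that gating logic, written against a hypothetical stand-in `settings` object rather than the real `SettingsStatic`; it is an illustration, not code from the diff.

# Sketch of the resume/fork gating added to SendManager.send_run.
# `settings` is a stand-in with `resume` and `fork_from` attributes (assumption).
def classify_run_start(settings, is_wandb_init):
    do_fork = settings.fork_from is not None and is_wandb_init
    do_resume = bool(settings.resume)

    if do_fork and do_resume:
        # send_run reports this combination as a USAGE error via _handle_error.
        raise ValueError(
            "You cannot use `resume` and `fork_from` together. Please choose one."
        )

    if do_fork:
        # _setup_fork starts the forked history one step past the fork point.
        return "fork", int(settings.fork_from.value) + 1
    if do_resume:
        return "resume", None
    return "new", None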
wandb/sdk/internal/sender_config.py
ADDED
@@ -0,0 +1,197 @@
+import json
+from typing import Any, Dict, NewType, Optional, Sequence
+
+from wandb.proto import wandb_internal_pb2
+from wandb.sdk.lib import proto_util, telemetry
+
+BackendConfigDict = NewType("BackendConfigDict", Dict[str, Any])
+"""Run config dictionary in the format used by the backend."""
+
+_WANDB_INTERNAL_KEY = "_wandb"
+
+
+class ConfigState:
+    """The configuration of a run."""
+
+    def __init__(self, tree: Optional[Dict[str, Any]] = None) -> None:
+        self._tree: Dict[str, Any] = tree or {}
+        """A tree with string-valued nodes and JSON leaves.
+
+        Leaves are Python objects that are valid JSON values:
+
+        * Primitives like strings and numbers
+        * Dictionaries from strings to JSON objects
+        * Lists of JSON objects
+        """
+
+    def non_internal_config(self) -> Dict[str, Any]:
+        """Returns the config settings minus "_wandb"."""
+        return {k: v for k, v in self._tree.items() if k != _WANDB_INTERNAL_KEY}
+
+    def update_from_proto(
+        self,
+        config_record: wandb_internal_pb2.ConfigRecord,
+    ) -> None:
+        """Applies update and remove commands."""
+        for config_item in config_record.update:
+            self._update_at_path(
+                _key_path(config_item),
+                json.loads(config_item.value_json),
+            )
+
+        for config_item in config_record.remove:
+            self._delete_at_path(_key_path(config_item))
+
+    def merge_resumed_config(self, old_config_tree: Dict[str, Any]) -> None:
+        """Merges the config from a run that's being resumed."""
+        # Add any top-level keys that aren't already set.
+        self._add_unset_keys_from_subtree(old_config_tree, [])
+
+        # Unfortunately, when a user logs visualizations, we store them in the
+        # run's config. When resuming a run, we want to avoid erasing previously
+        # logged visualizations, hence this special handling:
+        self._add_unset_keys_from_subtree(
+            old_config_tree,
+            [_WANDB_INTERNAL_KEY, "visualize"],
+        )
+        self._add_unset_keys_from_subtree(
+            old_config_tree,
+            [_WANDB_INTERNAL_KEY, "viz"],
+        )
+
+    def _add_unset_keys_from_subtree(
+        self,
+        old_config_tree: Dict[str, Any],
+        path: Sequence[str],
+    ) -> None:
+        """Uses the given subtree for keys that aren't already set."""
+        old_subtree = _subtree(old_config_tree, path, create=False)
+        if not old_subtree:
+            return
+
+        new_subtree = _subtree(self._tree, path, create=True)
+        assert new_subtree is not None
+
+        for key, value in old_subtree.items():
+            if key not in new_subtree:
+                new_subtree[key] = value
+
+    def to_backend_dict(
+        self,
+        telemetry_record: telemetry.TelemetryRecord,
+        framework: Optional[str],
+        start_time_millis: int,
+        metric_pbdicts: Sequence[Dict[int, Any]],
+    ) -> BackendConfigDict:
+        """Returns a dictionary representation expected by the backend.
+
+        The backend expects the configuration in a specific format, and the
+        config is also used to store additional metadata about the run.
+
+        Args:
+            telemetry_record: Telemetry information to insert.
+            framework: The detected framework used in the run (e.g. TensorFlow).
+            start_time_millis: The run's start time in Unix milliseconds.
+            metric_pbdicts: List of dict representations of metric protobuffers.
+        """
+        backend_dict = self._tree.copy()
+        wandb_internal = backend_dict.setdefault(_WANDB_INTERNAL_KEY, {})
+
+        ###################################################
+        # Telemetry information
+        ###################################################
+        py_version = telemetry_record.python_version
+        if py_version:
+            wandb_internal["python_version"] = py_version
+
+        cli_version = telemetry_record.cli_version
+        if cli_version:
+            wandb_internal["cli_version"] = cli_version
+
+        if framework:
+            wandb_internal["framework"] = framework
+
+        huggingface_version = telemetry_record.huggingface_version
+        if huggingface_version:
+            wandb_internal["huggingface_version"] = huggingface_version
+
+        wandb_internal["is_jupyter_run"] = telemetry_record.env.jupyter
+        wandb_internal["is_kaggle_kernel"] = telemetry_record.env.kaggle
+        wandb_internal["start_time"] = start_time_millis
+
+        # The full telemetry record.
+        wandb_internal["t"] = proto_util.proto_encode_to_dict(telemetry_record)
+
+        ###################################################
+        # Metrics
+        ###################################################
+        if metric_pbdicts:
+            wandb_internal["m"] = metric_pbdicts
+
+        return BackendConfigDict(
+            {
+                key: {
+                    # Configurations can be stored in a hand-written YAML file,
+                    # and users can add descriptions to their hyperparameters
+                    # there. However, we don't support a way to set descriptions
+                    # via code, so this is always None.
+                    "desc": None,
+                    "value": value,
+                }
+                for key, value in self._tree.items()
+            }
+        )
+
+    def _update_at_path(
+        self,
+        key_path: Sequence[str],
+        value: Any,
+    ) -> None:
+        """Sets the value at the path in the config tree."""
+        subtree = _subtree(self._tree, key_path[:-1], create=True)
+        assert subtree is not None
+
+        subtree[key_path[-1]] = value
+
+    def _delete_at_path(
+        self,
+        key_path: Sequence[str],
+    ) -> None:
+        """Removes the subtree at the path in the config tree."""
+        subtree = _subtree(self._tree, key_path[:-1], create=False)
+        if subtree:
+            del subtree[key_path[-1]]
+
+
+def _key_path(config_item: wandb_internal_pb2.ConfigItem) -> Sequence[str]:
+    """Returns the key path referenced by the config item."""
+    if config_item.nested_key:
+        return config_item.nested_key
+    elif config_item.key:
+        return [config_item.key]
+    else:
+        raise AssertionError(
+            "Invalid ConfigItem: either key or nested_key must be set",
+        )
+
+
+def _subtree(
+    tree: Dict[str, Any],
+    key_path: Sequence[str],
+    *,
+    create: bool = False,
+) -> Optional[Dict[str, Any]]:
+    """Returns a subtree at the given path."""
+    for key in key_path:
+        subtree = tree.get(key)
+
+        if not subtree:
+            if create:
+                subtree = {}
+                tree[key] = subtree
+            else:
+                return None
+
+        tree = subtree
+
+    return tree
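The new `ConfigState` class replaces the old `DictNoValues`/`DictWithValues` plumbing in the sender. Below is a rough usage sketch of this internal module as it appears in the diff; the config key and values are made up, and the snippet only uses fields shown above (`ConfigRecord.update`, `ConfigItem.key`, `ConfigItem.value_json`).

# Illustration of the new internal sender_config module (not a public API).
from wandb.proto import wandb_internal_pb2
from wandb.sdk.internal import sender_config
from wandb.sdk.lib import telemetry

state = sender_config.ConfigState()

# Simulate a config update arriving as a ConfigRecord proto.
record = wandb_internal_pb2.ConfigRecord()
item = record.update.add()
item.key = "learning_rate"      # made-up example key
item.value_json = "0.001"        # leaf values are JSON-encoded strings
state.update_from_proto(record)

# User-facing keys, without the internal "_wandb" section.
assert state.non_internal_config() == {"learning_rate": 0.001}

# Shape expected by the backend: each key wrapped as {"desc": ..., "value": ...}.
backend = state.to_backend_dict(
    telemetry_record=telemetry.TelemetryRecord(),
    framework=None,
    start_time_millis=0,
    metric_pbdicts=[],
)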
wandb/sdk/internal/settings_static.py
CHANGED
@@ -2,6 +2,7 @@ from dataclasses import fields
 from typing import Any, Iterable, Sequence, Tuple
 
 from wandb.proto import wandb_settings_pb2
+from wandb.sdk.lib import RunMoment
 from wandb.sdk.wandb_settings import SettingsData
 
 
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
                     unpacked_inner[inner_key] = inner_value
                 unpacked_mapping[outer_key] = unpacked_inner
             value = unpacked_mapping
+        elif key == "fork_from":
+            value = getattr(proto, key)
+            if value.run:
+                value = RunMoment(
+                    run=value.run, value=value.value, metric=value.metric
+                )
+            else:
+                value = None
         else:
             if proto.HasField(key):  # type: ignore [arg-type]
                 value = getattr(proto, key).value
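`SettingsStatic` now unpacks the `fork_from` proto field into a `RunMoment` (defined in the new `wandb/sdk/lib/run_moment.py`). A small hedged sketch using only the fields this hunk touches; the run id and step value are made up:

from wandb.sdk.lib import RunMoment

# The moment in the source run's history to fork from (example values).
moment = RunMoment(run="source-run-id", value=200, metric="_step")

# SendManager._setup_fork only supports forking on the "_step" metric and
# starts the forked run one step later.
assert moment.metric == "_step"
first_step = int(moment.value) + 1  # 201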
wandb/sdk/internal/system/system_info.py
CHANGED
@@ -14,7 +14,6 @@ from wandb.sdk.internal.settings_static import SettingsStatic
 from wandb.sdk.lib import filesystem
 from wandb.sdk.lib.filenames import CONDA_ENVIRONMENTS_FNAME, DIFF_FNAME, METADATA_FNAME
 from wandb.sdk.lib.gitlib import GitRepo
-from wandb.sdk.wandb_settings import _get_program_relpath
 
 from .assets.interfaces import Interface
 
@@ -168,7 +167,7 @@ class SystemInfo:
         data["program"] = self.settings.program
         # Used during artifact-job creation, always points to the relpath
         # of code execution, even when in a git repo
-        data["codePathLocal"] =
+        data["codePathLocal"] = self.settings._code_path_local
         if not self.settings.disable_code:
             if self.settings.program_relpath:
                 data["codePath"] = self.settings.program_relpath
@@ -213,7 +212,10 @@ class SystemInfo:
                 os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
             ) as f:
                 subprocess.call(
-                    ["conda", "env", "export"],
+                    ["conda", "env", "export"],
+                    stdout=f,
+                    stderr=subprocess.DEVNULL,
+                    timeout=15,  # add timeout since conda env export could take a really long time
                 )
         except Exception as e:
             logger.exception(f"Error saving conda packages: {e}")
wandb/sdk/internal/update.py
CHANGED
@@ -8,7 +8,7 @@ import wandb
 def _find_available(
     current_version: str,
 ) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
-    from
+    from wandb.util import parse_version
 
     pypi_url = f"https://pypi.org/pypi/{wandb._wandb_module}/json"
 
wandb/sdk/launch/_launch.py
CHANGED
@@ -10,7 +10,7 @@ import wandb
 from wandb.apis.internal import Api
 
 from . import loader
-from ._project_spec import
+from ._project_spec import LaunchProject
 from .agent import LaunchAgent
 from .builder.build import construct_agent_configs
 from .environment.local_environment import LocalEnvironment
@@ -221,8 +221,8 @@ async def _launch(
         author=None,
     )
     validate_launch_spec_source(launch_spec)
-    launch_project =
-    launch_project
+    launch_project = LaunchProject.from_spec(launch_spec, api)
+    launch_project.fetch_and_validate_project()
     entrypoint = launch_project.get_single_entry_point()
     image_uri = launch_project.docker_image  # Either set by user or None.
 