wandb 0.17.0rc2__py3-none-win32.whl → 0.17.2__py3-none-win32.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +4 -2
- wandb/apis/importers/internals/internal.py +0 -1
- wandb/apis/importers/wandb.py +12 -7
- wandb/apis/internal.py +0 -3
- wandb/apis/public/api.py +213 -79
- wandb/apis/public/artifacts.py +335 -100
- wandb/apis/public/files.py +9 -9
- wandb/apis/public/jobs.py +16 -4
- wandb/apis/public/projects.py +26 -28
- wandb/apis/public/query_generator.py +1 -1
- wandb/apis/public/runs.py +163 -65
- wandb/apis/public/sweeps.py +2 -2
- wandb/apis/reports/__init__.py +1 -7
- wandb/apis/reports/v1/__init__.py +5 -27
- wandb/apis/reports/v2/__init__.py +7 -19
- wandb/apis/workspaces/__init__.py +8 -0
- wandb/beta/workflows.py +8 -3
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +151 -59
- wandb/docker/__init__.py +1 -1
- wandb/errors/term.py +10 -2
- wandb/filesync/step_checksum.py +1 -4
- wandb/filesync/step_prepare.py +4 -24
- wandb/filesync/step_upload.py +5 -107
- wandb/filesync/upload_job.py +0 -76
- wandb/integration/gym/__init__.py +35 -15
- wandb/integration/openai/fine_tuning.py +21 -3
- wandb/integration/prodigy/prodigy.py +1 -1
- wandb/jupyter.py +16 -17
- wandb/old/summary.py +5 -0
- wandb/plot/pr_curve.py +2 -1
- wandb/plot/roc_curve.py +2 -1
- wandb/{plots → plot}/utils.py +13 -25
- wandb/proto/v3/wandb_internal_pb2.py +54 -54
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +54 -54
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_base_pb2.py +30 -0
- wandb/proto/v5/wandb_internal_pb2.py +355 -0
- wandb/proto/v5/wandb_server_pb2.py +63 -0
- wandb/proto/v5/wandb_settings_pb2.py +45 -0
- wandb/proto/v5/wandb_telemetry_pb2.py +41 -0
- wandb/proto/wandb_base_pb2.py +2 -0
- wandb/proto/wandb_deprecated.py +9 -1
- wandb/proto/wandb_generate_deprecated.py +34 -0
- wandb/proto/{wandb_internal_codegen.py → wandb_generate_proto.py} +1 -35
- wandb/proto/wandb_internal_pb2.py +2 -0
- wandb/proto/wandb_server_pb2.py +2 -0
- wandb/proto/wandb_settings_pb2.py +2 -0
- wandb/proto/wandb_telemetry_pb2.py +2 -0
- wandb/sdk/artifacts/artifact.py +76 -23
- wandb/sdk/artifacts/artifact_manifest.py +1 -1
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -3
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -1
- wandb/sdk/artifacts/artifact_saver.py +1 -10
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +6 -2
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -1
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +6 -4
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +2 -42
- wandb/sdk/artifacts/storage_policy.py +1 -12
- wandb/sdk/data_types/_dtypes.py +5 -2
- wandb/sdk/data_types/html.py +1 -1
- wandb/sdk/data_types/image.py +1 -1
- wandb/sdk/data_types/object_3d.py +1 -1
- wandb/sdk/data_types/video.py +4 -2
- wandb/sdk/interface/interface.py +13 -0
- wandb/sdk/interface/interface_shared.py +1 -1
- wandb/sdk/internal/file_pusher.py +2 -5
- wandb/sdk/internal/file_stream.py +6 -19
- wandb/sdk/internal/internal_api.py +160 -138
- wandb/sdk/internal/job_builder.py +207 -135
- wandb/sdk/internal/progress.py +0 -28
- wandb/sdk/internal/sender.py +105 -42
- wandb/sdk/internal/settings_static.py +8 -1
- wandb/sdk/internal/system/assets/gpu.py +2 -0
- wandb/sdk/internal/system/assets/trainium.py +3 -3
- wandb/sdk/internal/system/system_info.py +4 -2
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/__init__.py +9 -1
- wandb/sdk/launch/_launch.py +4 -24
- wandb/sdk/launch/_launch_add.py +1 -3
- wandb/sdk/launch/_project_spec.py +184 -224
- wandb/sdk/launch/agent/agent.py +58 -18
- wandb/sdk/launch/agent/config.py +0 -3
- wandb/sdk/launch/builder/abstract.py +67 -0
- wandb/sdk/launch/builder/build.py +165 -576
- wandb/sdk/launch/builder/context_manager.py +235 -0
- wandb/sdk/launch/builder/docker_builder.py +7 -23
- wandb/sdk/launch/builder/kaniko_builder.py +10 -23
- wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
- wandb/sdk/launch/create_job.py +51 -45
- wandb/sdk/launch/environment/aws_environment.py +26 -1
- wandb/sdk/launch/inputs/files.py +148 -0
- wandb/sdk/launch/inputs/internal.py +224 -0
- wandb/sdk/launch/inputs/manage.py +95 -0
- wandb/sdk/launch/runner/abstract.py +2 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +45 -12
- wandb/sdk/launch/runner/kubernetes_runner.py +6 -8
- wandb/sdk/launch/runner/local_container.py +2 -3
- wandb/sdk/launch/runner/local_process.py +8 -29
- wandb/sdk/launch/runner/sagemaker_runner.py +20 -14
- wandb/sdk/launch/runner/vertex_runner.py +8 -7
- wandb/sdk/launch/sweeps/scheduler.py +2 -0
- wandb/sdk/launch/sweeps/utils.py +2 -2
- wandb/sdk/launch/utils.py +16 -138
- wandb/sdk/lib/_settings_toposort_generated.py +2 -5
- wandb/sdk/lib/apikey.py +4 -2
- wandb/sdk/lib/config_util.py +3 -3
- wandb/sdk/lib/proto_util.py +22 -1
- wandb/sdk/lib/redirect.py +1 -1
- wandb/sdk/service/service.py +2 -1
- wandb/sdk/service/streams.py +5 -5
- wandb/sdk/wandb_init.py +25 -59
- wandb/sdk/wandb_login.py +28 -25
- wandb/sdk/wandb_run.py +135 -70
- wandb/sdk/wandb_settings.py +33 -64
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/plot/classifier.py +4 -6
- wandb/sync/sync.py +2 -2
- wandb/testing/relay.py +32 -17
- wandb/util.py +39 -37
- wandb/wandb_agent.py +3 -3
- wandb/wandb_controller.py +3 -2
- {wandb-0.17.0rc2.dist-info → wandb-0.17.2.dist-info}/METADATA +7 -9
- {wandb-0.17.0rc2.dist-info → wandb-0.17.2.dist-info}/RECORD +130 -152
- wandb/apis/reports/v1/_blocks.py +0 -1406
- wandb/apis/reports/v1/_helpers.py +0 -70
- wandb/apis/reports/v1/_panels.py +0 -1282
- wandb/apis/reports/v1/_templates.py +0 -478
- wandb/apis/reports/v1/blocks.py +0 -27
- wandb/apis/reports/v1/helpers.py +0 -2
- wandb/apis/reports/v1/mutations.py +0 -66
- wandb/apis/reports/v1/panels.py +0 -17
- wandb/apis/reports/v1/report.py +0 -268
- wandb/apis/reports/v1/runset.py +0 -144
- wandb/apis/reports/v1/templates.py +0 -7
- wandb/apis/reports/v1/util.py +0 -406
- wandb/apis/reports/v1/validators.py +0 -131
- wandb/apis/reports/v2/blocks.py +0 -25
- wandb/apis/reports/v2/expr_parsing.py +0 -257
- wandb/apis/reports/v2/gql.py +0 -68
- wandb/apis/reports/v2/interface.py +0 -1911
- wandb/apis/reports/v2/internal.py +0 -867
- wandb/apis/reports/v2/metrics.py +0 -6
- wandb/apis/reports/v2/panels.py +0 -15
- wandb/catboost/__init__.py +0 -9
- wandb/fastai/__init__.py +0 -9
- wandb/keras/__init__.py +0 -19
- wandb/lightgbm/__init__.py +0 -9
- wandb/plots/__init__.py +0 -6
- wandb/plots/explain_text.py +0 -36
- wandb/plots/heatmap.py +0 -81
- wandb/plots/named_entity.py +0 -43
- wandb/plots/part_of_speech.py +0 -50
- wandb/plots/plot_definitions.py +0 -768
- wandb/plots/precision_recall.py +0 -121
- wandb/plots/roc.py +0 -103
- wandb/sacred/__init__.py +0 -3
- wandb/xgboost/__init__.py +0 -9
- {wandb-0.17.0rc2.dist-info → wandb-0.17.2.dist-info}/WHEEL +0 -0
- {wandb-0.17.0rc2.dist-info → wandb-0.17.2.dist-info}/entry_points.txt +0 -0
- {wandb-0.17.0rc2.dist-info → wandb-0.17.2.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/internal/sender.py
CHANGED
@@ -39,7 +39,6 @@ from wandb.sdk.internal import (
|
|
39
39
|
datastore,
|
40
40
|
file_stream,
|
41
41
|
internal_api,
|
42
|
-
job_builder,
|
43
42
|
sender_config,
|
44
43
|
update,
|
45
44
|
)
|
@@ -217,6 +216,7 @@ class SendManager:
|
|
217
216
|
_record_exit: Optional["Record"]
|
218
217
|
_exit_result: Optional["RunExitResult"]
|
219
218
|
_resume_state: ResumeState
|
219
|
+
_rewind_response: Optional[Dict[str, Any]]
|
220
220
|
_cached_server_info: Dict[str, Any]
|
221
221
|
_cached_viewer: Dict[str, Any]
|
222
222
|
_server_messages: List[Dict[str, Any]]
|
@@ -276,6 +276,7 @@ class SendManager:
|
|
276
276
|
|
277
277
|
# State updated by resuming
|
278
278
|
self._resume_state = ResumeState()
|
279
|
+
self._rewind_response = None
|
279
280
|
|
280
281
|
# State added when run_exit is initiated and complete
|
281
282
|
self._record_exit = None
|
@@ -327,7 +328,6 @@ class SendManager:
|
|
327
328
|
# ignore_globs=(),
|
328
329
|
_sync=True,
|
329
330
|
disable_job_creation=False,
|
330
|
-
_async_upload_concurrency_limit=None,
|
331
331
|
_file_stream_timeout_seconds=0,
|
332
332
|
)
|
333
333
|
record_q: Queue[Record] = queue.Queue()
|
@@ -754,14 +754,14 @@ class SendManager:
|
|
754
754
|
project_name=run.project,
|
755
755
|
name=run.run_id,
|
756
756
|
)
|
757
|
-
|
758
|
-
if not resume_status:
|
757
|
+
# No resume status = run does not exist; No t key in wandbConfig = run exists but hasn't been inited
|
758
|
+
if not resume_status or '"t":' not in resume_status.get("wandbConfig", ""):
|
759
759
|
if self._settings.resume == "must":
|
760
760
|
error = wandb_internal_pb2.ErrorInfo()
|
761
761
|
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
762
762
|
error.message = (
|
763
763
|
"You provided an invalid value for the `resume` argument."
|
764
|
-
f" The value 'must' is not a valid option for resuming a run ({run.run_id}) that
|
764
|
+
f" The value 'must' is not a valid option for resuming a run ({run.run_id}) that has not been initialized."
|
765
765
|
" Please check your inputs and try again with a valid run ID."
|
766
766
|
" If you are trying to start a new run, please omit the `resume` argument or use `resume='allow'`."
|
767
767
|
)
|
@@ -808,7 +808,9 @@ class SendManager:
|
|
808
808
|
if self._settings.resume == "must":
|
809
809
|
error = wandb_internal_pb2.ErrorInfo()
|
810
810
|
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
811
|
-
error.message = "resume='must' but could not resume (
|
811
|
+
error.message = "resume='must' but could not resume ({}) ".format(
|
812
|
+
run.run_id
|
813
|
+
)
|
812
814
|
return error
|
813
815
|
|
814
816
|
# TODO: Do we need to restore config / summary?
|
@@ -824,7 +826,7 @@ class SendManager:
|
|
824
826
|
self._resume_state.summary = summary
|
825
827
|
self._resume_state.tags = tags
|
826
828
|
self._resume_state.resumed = True
|
827
|
-
logger.info("configured resuming with:
|
829
|
+
logger.info("configured resuming with: {}".format(self._resume_state))
|
828
830
|
return None
|
829
831
|
|
830
832
|
def _telemetry_get_framework(self) -> str:
|
@@ -890,6 +892,36 @@ class SendManager:
|
|
890
892
|
self._run.forked = True
|
891
893
|
self._run.starting_step = first_step
|
892
894
|
|
895
|
+
def _load_rewind_state(self, run: "RunRecord"):
|
896
|
+
assert self._settings.resume_from
|
897
|
+
self._rewind_response = self._api.rewind_run(
|
898
|
+
run_name=run.run_id,
|
899
|
+
entity=run.entity or None,
|
900
|
+
project=run.project or None,
|
901
|
+
metric_name=self._settings.resume_from.metric,
|
902
|
+
metric_value=self._settings.resume_from.value,
|
903
|
+
program_path=self._settings.program or None,
|
904
|
+
)
|
905
|
+
self._resume_state.history = self._rewind_response.get("historyLineCount", 0)
|
906
|
+
self._resume_state.config = json.loads(
|
907
|
+
self._rewind_response.get("config", "{}")
|
908
|
+
)
|
909
|
+
|
910
|
+
def _install_rewind_state(self):
|
911
|
+
assert self._settings.resume_from
|
912
|
+
assert self._settings.resume_from.metric == "_step"
|
913
|
+
assert self._run
|
914
|
+
assert self._rewind_response
|
915
|
+
|
916
|
+
first_step = int(self._settings.resume_from.value) + 1
|
917
|
+
self._resume_state.step = first_step
|
918
|
+
|
919
|
+
# We set the fork flag here because rewind uses the forking
|
920
|
+
# infrastructure under the hood. Setting `forked` here
|
921
|
+
# ensures that run._step is properly set in the user process.
|
922
|
+
self._run.forked = True
|
923
|
+
self._run.starting_step = first_step
|
924
|
+
|
893
925
|
def _handle_error(
|
894
926
|
self,
|
895
927
|
record: "Record",
|
@@ -926,13 +958,16 @@ class SendManager:
|
|
926
958
|
self._config_save(config_value_dict)
|
927
959
|
|
928
960
|
do_fork = self._settings.fork_from is not None and is_wandb_init
|
961
|
+
do_rewind = self._settings.resume_from is not None and is_wandb_init
|
929
962
|
do_resume = bool(self._settings.resume)
|
930
963
|
|
931
|
-
|
964
|
+
num_resume_options_set = sum([do_fork, do_rewind, do_resume])
|
965
|
+
if num_resume_options_set > 1:
|
932
966
|
error = wandb_internal_pb2.ErrorInfo()
|
933
967
|
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
934
968
|
error.message = (
|
935
|
-
"
|
969
|
+
"Multiple resume options specified. "
|
970
|
+
"Please specify only one of `fork_from`, `resume`, or `resume_from`."
|
936
971
|
)
|
937
972
|
self._handle_error(record, error, run)
|
938
973
|
|
@@ -945,6 +980,9 @@ class SendManager:
|
|
945
980
|
if do_resume:
|
946
981
|
error = self._setup_resume(run)
|
947
982
|
|
983
|
+
elif do_rewind:
|
984
|
+
error = self._load_rewind_state(run)
|
985
|
+
|
948
986
|
if error is not None:
|
949
987
|
self._handle_error(record, error, run)
|
950
988
|
return
|
@@ -996,6 +1034,26 @@ class SendManager:
|
|
996
1034
|
else:
|
997
1035
|
logger.info("updated run: %s", self._run.run_id)
|
998
1036
|
|
1037
|
+
def _update_resume_state(self, is_rewinding: bool, inserted: bool):
|
1038
|
+
assert self._run
|
1039
|
+
if self._resume_state.resumed:
|
1040
|
+
self._run.resumed = True
|
1041
|
+
if self._resume_state.wandb_runtime is not None:
|
1042
|
+
self._run.runtime = self._resume_state.wandb_runtime
|
1043
|
+
elif is_rewinding:
|
1044
|
+
# because is_rewinding is mutually exclusive with self._resume_state.resumed,
|
1045
|
+
# this block will always execute if is_rewinding is set
|
1046
|
+
self._install_rewind_state()
|
1047
|
+
else:
|
1048
|
+
# If the user is not resuming, and we didn't insert on upsert_run then
|
1049
|
+
# it is likely that we are overwriting the run which we might want to
|
1050
|
+
# prevent in the future. This could be a false signal since an upsert_run
|
1051
|
+
# message which gets retried in the network could also show up as not
|
1052
|
+
# inserted.
|
1053
|
+
if not inserted:
|
1054
|
+
# no need to flush this, it will get updated eventually
|
1055
|
+
self._telemetry_obj.feature.maybe_run_overwrite = True
|
1056
|
+
|
999
1057
|
def _init_run(
|
1000
1058
|
self,
|
1001
1059
|
run: "RunRecord",
|
@@ -1011,22 +1069,30 @@ class SendManager:
|
|
1011
1069
|
if self._resume_state and self._resume_state.tags and not run.tags:
|
1012
1070
|
run.tags.extend(self._resume_state.tags)
|
1013
1071
|
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1072
|
+
is_rewinding = bool(self._settings.resume_from)
|
1073
|
+
if is_rewinding:
|
1074
|
+
assert self._rewind_response
|
1075
|
+
server_run = self._rewind_response
|
1076
|
+
server_messages = None
|
1077
|
+
inserted = True
|
1078
|
+
else:
|
1079
|
+
server_run, inserted, server_messages = self._api.upsert_run(
|
1080
|
+
name=run.run_id,
|
1081
|
+
entity=run.entity or None,
|
1082
|
+
project=run.project or None,
|
1083
|
+
group=run.run_group or None,
|
1084
|
+
job_type=run.job_type or None,
|
1085
|
+
display_name=run.display_name or None,
|
1086
|
+
notes=run.notes or None,
|
1087
|
+
tags=run.tags[:] or None,
|
1088
|
+
config=config_dict or None,
|
1089
|
+
sweep_name=run.sweep_id or None,
|
1090
|
+
host=run.host or None,
|
1091
|
+
program_path=self._settings.program or None,
|
1092
|
+
repo=run.git.remote_url or None,
|
1093
|
+
commit=run.git.commit or None,
|
1094
|
+
)
|
1095
|
+
|
1030
1096
|
# TODO: we don't want to create jobs in sweeps, since the
|
1031
1097
|
# executable doesn't appear to be consistent
|
1032
1098
|
if run.sweep_id:
|
@@ -1034,19 +1100,17 @@ class SendManager:
|
|
1034
1100
|
|
1035
1101
|
self._server_messages = server_messages or []
|
1036
1102
|
self._run = run
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
# no need to flush this, it will get updated eventually
|
1049
|
-
self._telemetry_obj.feature.maybe_run_overwrite = True
|
1103
|
+
|
1104
|
+
if self._resume_state.resumed and is_rewinding:
|
1105
|
+
# this should not ever be possible to hit, since we check for
|
1106
|
+
# resumption above and raise an error if resumption is specified
|
1107
|
+
# twice.
|
1108
|
+
raise ValueError(
|
1109
|
+
"Cannot attempt to rewind and resume a run - only one of "
|
1110
|
+
"`resume` or `resume_from` can be specified."
|
1111
|
+
)
|
1112
|
+
|
1113
|
+
self._update_resume_state(is_rewinding, inserted)
|
1050
1114
|
self._run.starting_step = self._resume_state.step
|
1051
1115
|
self._run.start_time.FromMicroseconds(int(start_time * 1e6))
|
1052
1116
|
self._run.config.CopyFrom(self._interface._make_config(config_dict))
|
@@ -1439,9 +1503,7 @@ class SendManager:
|
|
1439
1503
|
self._job_builder.disable = True
|
1440
1504
|
elif use.partial.job_name:
|
1441
1505
|
# job is partial, let job builder rebuild job, set job source dict
|
1442
|
-
self._job_builder.
|
1443
|
-
job_builder.convert_use_artifact_to_job_source(record.use_artifact)
|
1444
|
-
)
|
1506
|
+
self._job_builder.set_partial_source_id(use.id)
|
1445
1507
|
|
1446
1508
|
def send_request_log_artifact(self, record: "Record") -> None:
|
1447
1509
|
assert record.control.req_resp
|
@@ -1617,7 +1679,8 @@ class SendManager:
|
|
1617
1679
|
summary_dict = self._cached_summary.copy()
|
1618
1680
|
summary_dict.pop("_wandb", None)
|
1619
1681
|
self._job_builder.set_summary(summary_dict)
|
1620
|
-
|
1682
|
+
|
1683
|
+
artifact = self._job_builder.build(api=self._api)
|
1621
1684
|
if artifact is not None and self._run is not None:
|
1622
1685
|
proto_artifact = self._interface._make_artifact(artifact)
|
1623
1686
|
proto_artifact.run_id = self._run.run_id
|
@@ -18,6 +18,7 @@ class SettingsStatic(SettingsData):
|
|
18
18
|
object.__setattr__(self, "_proto", proto)
|
19
19
|
|
20
20
|
def _from_proto(self, proto: wandb_settings_pb2.Settings) -> None:
|
21
|
+
forks_specified: list[str] = []
|
21
22
|
for field in fields(SettingsData):
|
22
23
|
key = field.name
|
23
24
|
value: Any = None
|
@@ -39,12 +40,13 @@ class SettingsStatic(SettingsData):
|
|
39
40
|
unpacked_inner[inner_key] = inner_value
|
40
41
|
unpacked_mapping[outer_key] = unpacked_inner
|
41
42
|
value = unpacked_mapping
|
42
|
-
elif key == "fork_from":
|
43
|
+
elif key == "fork_from" or key == "resume_from":
|
43
44
|
value = getattr(proto, key)
|
44
45
|
if value.run:
|
45
46
|
value = RunMoment(
|
46
47
|
run=value.run, value=value.value, metric=value.metric
|
47
48
|
)
|
49
|
+
forks_specified.append(key)
|
48
50
|
else:
|
49
51
|
value = None
|
50
52
|
else:
|
@@ -58,6 +60,11 @@ class SettingsStatic(SettingsData):
|
|
58
60
|
value = None
|
59
61
|
object.__setattr__(self, key, value)
|
60
62
|
|
63
|
+
if len(forks_specified) > 1:
|
64
|
+
raise ValueError(
|
65
|
+
"Only one of fork_from or resume_from can be specified, not both"
|
66
|
+
)
|
67
|
+
|
61
68
|
def __setattr__(self, name: str, value: object) -> None:
|
62
69
|
raise AttributeError("Error: SettingsStatic is a readonly object")
|
63
70
|
|
@@ -133,7 +133,7 @@ class NeuronCoreStats:
|
|
133
133
|
process.kill()
|
134
134
|
process.wait()
|
135
135
|
except Exception as e:
|
136
|
-
logger.error("neuron-monitor failed:
|
136
|
+
logger.error("neuron-monitor failed: {}".format(e))
|
137
137
|
|
138
138
|
def __init__(
|
139
139
|
self,
|
@@ -175,7 +175,7 @@ class NeuronCoreStats:
|
|
175
175
|
assert self.neuron_monitor_thread is not None
|
176
176
|
self.neuron_monitor_thread.join()
|
177
177
|
except Exception as e:
|
178
|
-
logger.error("neuron-monitor thread failed to stop:
|
178
|
+
logger.error("neuron-monitor thread failed to stop: {}".format(e))
|
179
179
|
finally:
|
180
180
|
self.neuron_monitor_thread = None
|
181
181
|
|
@@ -394,5 +394,5 @@ class Trainium:
|
|
394
394
|
|
395
395
|
return {self.name: neuron_hardware_info}
|
396
396
|
except Exception as e:
|
397
|
-
logger.error("neuron-monitor failed:
|
397
|
+
logger.error("neuron-monitor failed: {}".format(e))
|
398
398
|
return {}
|
@@ -55,7 +55,9 @@ class SystemInfo:
|
|
55
55
|
)
|
56
56
|
program_absolute = os.path.join(root, program_relative)
|
57
57
|
if not os.path.exists(program_absolute):
|
58
|
-
logger.warning(
|
58
|
+
logger.warning(
|
59
|
+
"unable to save code -- can't find {}".format(program_absolute)
|
60
|
+
)
|
59
61
|
return None
|
60
62
|
saved_program = os.path.join(self.settings.files_dir, "code", program_relative)
|
61
63
|
self.saved_program = program_relative # type: ignore
|
@@ -120,7 +122,7 @@ class SystemInfo:
|
|
120
122
|
subprocess.CalledProcessError,
|
121
123
|
subprocess.TimeoutExpired,
|
122
124
|
) as e:
|
123
|
-
logger.error("Error generating diff:
|
125
|
+
logger.error("Error generating diff: {}".format(e))
|
124
126
|
logger.debug("Saving git patches done")
|
125
127
|
|
126
128
|
def _probe_git(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
wandb/sdk/internal/update.py
CHANGED
@@ -98,7 +98,7 @@ def check_available(current_version: str) -> Optional[Dict[str, Optional[str]]]:
|
|
98
98
|
)
|
99
99
|
yank_message = None
|
100
100
|
if yanked:
|
101
|
-
reason_message = "(
|
101
|
+
reason_message = "({}) ".format(yanked_reason) if yanked_reason else ""
|
102
102
|
yank_message = "{} version {} has been recalled! {}Please upgrade.".format(
|
103
103
|
wandb_module_name,
|
104
104
|
current_version,
|
wandb/sdk/launch/__init__.py
CHANGED
@@ -1,6 +1,14 @@
|
|
1
1
|
from ._launch import launch
|
2
2
|
from ._launch_add import launch_add
|
3
3
|
from .agent.agent import LaunchAgent
|
4
|
+
from .inputs.manage import manage_config_file, manage_wandb_config
|
4
5
|
from .utils import load_wandb_config
|
5
6
|
|
6
|
-
__all__ = [
|
7
|
+
__all__ = [
|
8
|
+
"LaunchAgent",
|
9
|
+
"launch",
|
10
|
+
"launch_add",
|
11
|
+
"load_wandb_config",
|
12
|
+
"manage_config_file",
|
13
|
+
"manage_wandb_config",
|
14
|
+
]
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -12,13 +12,12 @@ from wandb.apis.internal import Api
|
|
12
12
|
from . import loader
|
13
13
|
from ._project_spec import LaunchProject
|
14
14
|
from .agent import LaunchAgent
|
15
|
-
from .
|
15
|
+
from .agent.agent import construct_agent_configs
|
16
16
|
from .environment.local_environment import LocalEnvironment
|
17
17
|
from .errors import ExecutionError, LaunchError
|
18
18
|
from .runner.abstract import AbstractRun
|
19
19
|
from .utils import (
|
20
20
|
LAUNCH_CONFIG_FILE,
|
21
|
-
LAUNCH_DEFAULT_PROJECT,
|
22
21
|
PROJECT_SYNCHRONOUS,
|
23
22
|
construct_launch_spec,
|
24
23
|
validate_launch_spec_source,
|
@@ -58,7 +57,6 @@ def set_launch_logfile(logfile: str) -> None:
|
|
58
57
|
|
59
58
|
def resolve_agent_config( # noqa: C901
|
60
59
|
entity: Optional[str],
|
61
|
-
project: Optional[str],
|
62
60
|
max_jobs: Optional[int],
|
63
61
|
queues: Optional[Tuple[str]],
|
64
62
|
config: Optional[str],
|
@@ -69,7 +67,6 @@ def resolve_agent_config( # noqa: C901
|
|
69
67
|
Arguments:
|
70
68
|
api (Api): The api.
|
71
69
|
entity (str): The entity.
|
72
|
-
project (str): The project.
|
73
70
|
max_jobs (int): The max number of jobs.
|
74
71
|
queues (Tuple[str]): The queues.
|
75
72
|
config (str): The config.
|
@@ -79,7 +76,6 @@ def resolve_agent_config( # noqa: C901
|
|
79
76
|
Tuple[Dict[str, Any], Api]: The resolved config and api.
|
80
77
|
"""
|
81
78
|
defaults = {
|
82
|
-
"project": LAUNCH_DEFAULT_PROJECT,
|
83
79
|
"max_jobs": 1,
|
84
80
|
"max_schedulers": 1,
|
85
81
|
"queues": [],
|
@@ -87,7 +83,6 @@ def resolve_agent_config( # noqa: C901
|
|
87
83
|
"builder": {},
|
88
84
|
"verbosity": 0,
|
89
85
|
}
|
90
|
-
user_set_project = False
|
91
86
|
resolved_config: Dict[str, Any] = defaults
|
92
87
|
config_path = config or os.path.expanduser(LAUNCH_CONFIG_FILE)
|
93
88
|
if os.path.isfile(config_path):
|
@@ -100,16 +95,11 @@ def resolve_agent_config( # noqa: C901
|
|
100
95
|
launch_config = {} # type: ignore
|
101
96
|
except yaml.YAMLError as e:
|
102
97
|
raise LaunchError(f"Invalid launch agent config: {e}")
|
103
|
-
if launch_config.get("project") is not None:
|
104
|
-
user_set_project = True
|
105
98
|
resolved_config.update(launch_config.items())
|
106
99
|
elif config is not None:
|
107
100
|
raise LaunchError(
|
108
101
|
f"Could not find use specified launch config file: {config_path}"
|
109
102
|
)
|
110
|
-
if os.environ.get("WANDB_PROJECT") is not None:
|
111
|
-
resolved_config.update({"project": os.environ.get("WANDB_PROJECT")})
|
112
|
-
user_set_project = True
|
113
103
|
if os.environ.get("WANDB_ENTITY") is not None:
|
114
104
|
resolved_config.update({"entity": os.environ.get("WANDB_ENTITY")})
|
115
105
|
if os.environ.get("WANDB_LAUNCH_MAX_JOBS") is not None:
|
@@ -117,9 +107,6 @@ def resolve_agent_config( # noqa: C901
|
|
117
107
|
{"max_jobs": int(os.environ.get("WANDB_LAUNCH_MAX_JOBS", 1))}
|
118
108
|
)
|
119
109
|
|
120
|
-
if project is not None:
|
121
|
-
resolved_config.update({"project": project})
|
122
|
-
user_set_project = True
|
123
110
|
if entity is not None:
|
124
111
|
resolved_config.update({"entity": entity})
|
125
112
|
if max_jobs is not None:
|
@@ -138,7 +125,7 @@ def resolve_agent_config( # noqa: C901
|
|
138
125
|
+ " (expected str). Specify multiple queues with the 'queues' key"
|
139
126
|
)
|
140
127
|
|
141
|
-
keys = ["
|
128
|
+
keys = ["entity"]
|
142
129
|
settings = {
|
143
130
|
k: resolved_config.get(k) for k in keys if resolved_config.get(k) is not None
|
144
131
|
}
|
@@ -147,10 +134,6 @@ def resolve_agent_config( # noqa: C901
|
|
147
134
|
|
148
135
|
if resolved_config.get("entity") is None:
|
149
136
|
resolved_config.update({"entity": api.default_entity})
|
150
|
-
if user_set_project:
|
151
|
-
wandb.termwarn(
|
152
|
-
"Specifying a project for the launch agent is deprecated. Please use queues found in the Launch application at https://wandb.ai/launch."
|
153
|
-
)
|
154
137
|
|
155
138
|
return resolved_config, api
|
156
139
|
|
@@ -188,7 +171,6 @@ def create_and_run_agent(
|
|
188
171
|
|
189
172
|
async def _launch(
|
190
173
|
api: Api,
|
191
|
-
uri: Optional[str] = None,
|
192
174
|
job: Optional[str] = None,
|
193
175
|
name: Optional[str] = None,
|
194
176
|
project: Optional[str] = None,
|
@@ -209,7 +191,7 @@ async def _launch(
|
|
209
191
|
if resource is None:
|
210
192
|
resource = "local-container"
|
211
193
|
launch_spec = construct_launch_spec(
|
212
|
-
|
194
|
+
None,
|
213
195
|
job,
|
214
196
|
api,
|
215
197
|
name,
|
@@ -228,7 +210,7 @@ async def _launch(
|
|
228
210
|
validate_launch_spec_source(launch_spec)
|
229
211
|
launch_project = LaunchProject.from_spec(launch_spec, api)
|
230
212
|
launch_project.fetch_and_validate_project()
|
231
|
-
entrypoint = launch_project.
|
213
|
+
entrypoint = launch_project.get_job_entry_point()
|
232
214
|
image_uri = launch_project.docker_image # Either set by user or None.
|
233
215
|
|
234
216
|
# construct runner config.
|
@@ -326,8 +308,6 @@ def launch(
|
|
326
308
|
"""
|
327
309
|
submitted_run_obj = asyncio.run(
|
328
310
|
_launch(
|
329
|
-
# TODO: fully deprecate URI path
|
330
|
-
uri=None,
|
331
311
|
job=job,
|
332
312
|
name=name,
|
333
313
|
project=project,
|
wandb/sdk/launch/_launch_add.py
CHANGED
@@ -109,7 +109,6 @@ def launch_add(
|
|
109
109
|
|
110
110
|
return _launch_add(
|
111
111
|
api,
|
112
|
-
uri,
|
113
112
|
job,
|
114
113
|
config,
|
115
114
|
template_variables,
|
@@ -134,7 +133,6 @@ def launch_add(
|
|
134
133
|
|
135
134
|
def _launch_add(
|
136
135
|
api: Api,
|
137
|
-
uri: Optional[str],
|
138
136
|
job: Optional[str],
|
139
137
|
config: Optional[Dict[str, Any]],
|
140
138
|
template_variables: Optional[dict],
|
@@ -156,7 +154,7 @@ def _launch_add(
|
|
156
154
|
priority: Optional[int] = None,
|
157
155
|
) -> "public.QueuedRun":
|
158
156
|
launch_spec = construct_launch_spec(
|
159
|
-
|
157
|
+
None,
|
160
158
|
job,
|
161
159
|
api,
|
162
160
|
name,
|