wandb 0.19.12rc1__py3-none-win32.whl → 0.20.1__py3-none-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +1 -2
- wandb/__init__.pyi +3 -6
- wandb/_iterutils.py +26 -7
- wandb/_pydantic/__init__.py +2 -1
- wandb/_pydantic/utils.py +7 -0
- wandb/agents/pyagent.py +9 -15
- wandb/analytics/sentry.py +1 -2
- wandb/apis/attrs.py +3 -4
- wandb/apis/importers/internals/util.py +1 -1
- wandb/apis/importers/validation.py +2 -2
- wandb/apis/importers/wandb.py +30 -25
- wandb/apis/normalize.py +2 -2
- wandb/apis/public/__init__.py +1 -0
- wandb/apis/public/api.py +37 -33
- wandb/apis/public/artifacts.py +103 -72
- wandb/apis/public/jobs.py +3 -2
- wandb/apis/public/registries/registries_search.py +4 -2
- wandb/apis/public/registries/registry.py +1 -1
- wandb/apis/public/registries/utils.py +9 -9
- wandb/apis/public/runs.py +18 -6
- wandb/automations/_filters/expressions.py +1 -1
- wandb/automations/_filters/operators.py +1 -1
- wandb/automations/_filters/run_metrics.py +1 -1
- wandb/beta/workflows.py +6 -5
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +54 -73
- wandb/docker/__init__.py +21 -74
- wandb/docker/names.py +40 -0
- wandb/env.py +0 -1
- wandb/errors/util.py +1 -1
- wandb/filesync/step_checksum.py +1 -1
- wandb/filesync/step_upload.py +1 -1
- wandb/integration/diffusers/resolvers/multimodal.py +1 -2
- wandb/integration/gym/__init__.py +5 -6
- wandb/integration/keras/callbacks/model_checkpoint.py +2 -2
- wandb/integration/keras/keras.py +13 -19
- wandb/integration/kfp/kfp_patch.py +2 -3
- wandb/integration/langchain/wandb_tracer.py +1 -1
- wandb/integration/metaflow/metaflow.py +13 -13
- wandb/integration/openai/fine_tuning.py +3 -2
- wandb/integration/sagemaker/auth.py +2 -1
- wandb/integration/sklearn/utils.py +2 -1
- wandb/integration/tensorboard/__init__.py +1 -1
- wandb/integration/tensorboard/log.py +2 -5
- wandb/integration/tensorflow/__init__.py +2 -2
- wandb/jupyter.py +20 -17
- wandb/plot/confusion_matrix.py +1 -1
- wandb/plot/utils.py +8 -7
- wandb/proto/v3/wandb_internal_pb2.py +355 -335
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
- wandb/proto/v4/wandb_internal_pb2.py +339 -335
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
- wandb/proto/v5/wandb_internal_pb2.py +339 -335
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +12 -12
- wandb/proto/v6/wandb_internal_pb2.py +339 -335
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +12 -12
- wandb/proto/wandb_deprecated.py +6 -8
- wandb/sdk/artifacts/_internal_artifact.py +43 -0
- wandb/sdk/artifacts/_validators.py +55 -35
- wandb/sdk/artifacts/artifact.py +117 -115
- wandb/sdk/artifacts/artifact_download_logger.py +2 -0
- wandb/sdk/artifacts/artifact_saver.py +1 -3
- wandb/sdk/artifacts/artifact_state.py +2 -0
- wandb/sdk/artifacts/artifact_ttl.py +2 -0
- wandb/sdk/artifacts/exceptions.py +14 -0
- wandb/sdk/artifacts/staging.py +2 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +2 -6
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -1
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +2 -6
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -5
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -1
- wandb/sdk/artifacts/storage_layout.py +2 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +3 -3
- wandb/sdk/backend/backend.py +11 -182
- wandb/sdk/data_types/_dtypes.py +2 -6
- wandb/sdk/data_types/audio.py +20 -3
- wandb/sdk/data_types/base_types/media.py +12 -7
- wandb/sdk/data_types/base_types/wb_value.py +8 -18
- wandb/sdk/data_types/bokeh.py +19 -2
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +17 -1
- wandb/sdk/data_types/helper_types/image_mask.py +7 -1
- wandb/sdk/data_types/html.py +4 -4
- wandb/sdk/data_types/image.py +178 -103
- wandb/sdk/data_types/molecule.py +6 -6
- wandb/sdk/data_types/object_3d.py +10 -5
- wandb/sdk/data_types/saved_model.py +11 -6
- wandb/sdk/data_types/table.py +313 -83
- wandb/sdk/data_types/table_decorators.py +108 -0
- wandb/sdk/data_types/utils.py +43 -7
- wandb/sdk/data_types/video.py +21 -3
- wandb/sdk/interface/interface.py +10 -0
- wandb/sdk/internal/datastore.py +2 -6
- wandb/sdk/internal/file_pusher.py +1 -5
- wandb/sdk/internal/file_stream.py +8 -17
- wandb/sdk/internal/handler.py +2 -2
- wandb/sdk/internal/incremental_table_util.py +53 -0
- wandb/sdk/internal/internal.py +3 -5
- wandb/sdk/internal/internal_api.py +66 -89
- wandb/sdk/internal/job_builder.py +2 -7
- wandb/sdk/internal/profiler.py +2 -2
- wandb/sdk/internal/progress.py +1 -3
- wandb/sdk/internal/run.py +1 -6
- wandb/sdk/internal/sender.py +24 -36
- wandb/sdk/internal/system/assets/aggregators.py +1 -7
- wandb/sdk/internal/system/assets/disk.py +3 -3
- wandb/sdk/internal/system/assets/gpu.py +4 -4
- wandb/sdk/internal/system/assets/gpu_amd.py +4 -4
- wandb/sdk/internal/system/assets/interfaces.py +6 -6
- wandb/sdk/internal/system/assets/tpu.py +1 -1
- wandb/sdk/internal/system/assets/trainium.py +6 -6
- wandb/sdk/internal/system/system_info.py +5 -7
- wandb/sdk/internal/system/system_monitor.py +4 -4
- wandb/sdk/internal/tb_watcher.py +5 -7
- wandb/sdk/launch/_launch.py +1 -1
- wandb/sdk/launch/_project_spec.py +19 -20
- wandb/sdk/launch/agent/agent.py +3 -3
- wandb/sdk/launch/agent/config.py +1 -1
- wandb/sdk/launch/agent/job_status_tracker.py +2 -2
- wandb/sdk/launch/builder/build.py +2 -3
- wandb/sdk/launch/builder/kaniko_builder.py +5 -4
- wandb/sdk/launch/environment/gcp_environment.py +1 -2
- wandb/sdk/launch/registry/azure_container_registry.py +2 -2
- wandb/sdk/launch/registry/elastic_container_registry.py +2 -2
- wandb/sdk/launch/registry/google_artifact_registry.py +3 -3
- wandb/sdk/launch/runner/abstract.py +5 -5
- wandb/sdk/launch/runner/kubernetes_monitor.py +2 -2
- wandb/sdk/launch/runner/kubernetes_runner.py +1 -1
- wandb/sdk/launch/runner/sagemaker_runner.py +2 -4
- wandb/sdk/launch/runner/vertex_runner.py +2 -7
- wandb/sdk/launch/sweeps/__init__.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +2 -2
- wandb/sdk/launch/sweeps/utils.py +3 -3
- wandb/sdk/launch/utils.py +3 -4
- wandb/sdk/lib/apikey.py +5 -8
- wandb/sdk/lib/config_util.py +3 -3
- wandb/sdk/lib/fsm.py +3 -18
- wandb/sdk/lib/gitlib.py +6 -5
- wandb/sdk/lib/ipython.py +2 -2
- wandb/sdk/lib/json_util.py +9 -14
- wandb/sdk/lib/printer.py +3 -8
- wandb/sdk/lib/redirect.py +1 -1
- wandb/sdk/lib/retry.py +3 -7
- wandb/sdk/lib/run_moment.py +2 -2
- wandb/sdk/lib/service_connection.py +3 -1
- wandb/sdk/lib/service_token.py +1 -2
- wandb/sdk/mailbox/mailbox_handle.py +3 -7
- wandb/sdk/mailbox/response_handle.py +2 -6
- wandb/sdk/service/streams.py +3 -7
- wandb/sdk/verify/verify.py +5 -6
- wandb/sdk/wandb_config.py +1 -1
- wandb/sdk/wandb_init.py +38 -106
- wandb/sdk/wandb_login.py +7 -6
- wandb/sdk/wandb_run.py +52 -240
- wandb/sdk/wandb_settings.py +71 -60
- wandb/sdk/wandb_setup.py +40 -14
- wandb/sdk/wandb_watch.py +5 -7
- wandb/sync/__init__.py +1 -1
- wandb/sync/sync.py +13 -13
- wandb/util.py +17 -35
- wandb/wandb_agent.py +8 -11
- {wandb-0.19.12rc1.dist-info → wandb-0.20.1.dist-info}/METADATA +5 -5
- {wandb-0.19.12rc1.dist-info → wandb-0.20.1.dist-info}/RECORD +170 -168
- wandb/docker/auth.py +0 -435
- wandb/docker/www_authenticate.py +0 -94
- {wandb-0.19.12rc1.dist-info → wandb-0.20.1.dist-info}/WHEEL +0 -0
- {wandb-0.19.12rc1.dist-info → wandb-0.20.1.dist-info}/entry_points.txt +0 -0
- {wandb-0.19.12rc1.dist-info → wandb-0.20.1.dist-info}/licenses/LICENSE +0 -0
@@ -126,8 +126,8 @@ class NeuronCoreStats:
|
|
126
126
|
self.raw_samples.append(raw_data)
|
127
127
|
process.kill()
|
128
128
|
process.wait()
|
129
|
-
except Exception
|
130
|
-
logger.
|
129
|
+
except Exception:
|
130
|
+
logger.exception("neuron-monitor failed")
|
131
131
|
|
132
132
|
def __init__(
|
133
133
|
self,
|
@@ -168,8 +168,8 @@ class NeuronCoreStats:
|
|
168
168
|
self.shutdown_event.set()
|
169
169
|
assert self.neuron_monitor_thread is not None
|
170
170
|
self.neuron_monitor_thread.join()
|
171
|
-
except Exception
|
172
|
-
logger.
|
171
|
+
except Exception:
|
172
|
+
logger.exception("neuron-monitor thread failed to stop")
|
173
173
|
finally:
|
174
174
|
self.neuron_monitor_thread = None
|
175
175
|
|
@@ -388,6 +388,6 @@ class Trainium:
|
|
388
388
|
pass
|
389
389
|
|
390
390
|
return {self.name: neuron_hardware_info}
|
391
|
-
except Exception
|
392
|
-
logger.
|
391
|
+
except Exception:
|
392
|
+
logger.exception("neuron-monitor failed")
|
393
393
|
return {}
|
@@ -55,9 +55,7 @@ class SystemInfo:
|
|
55
55
|
)
|
56
56
|
program_absolute = os.path.join(root, program_relative)
|
57
57
|
if not os.path.exists(program_absolute):
|
58
|
-
logger.warning(
|
59
|
-
"unable to save code -- can't find {}".format(program_absolute)
|
60
|
-
)
|
58
|
+
logger.warning(f"unable to save code -- can't find {program_absolute}")
|
61
59
|
return None
|
62
60
|
saved_program = os.path.join(self.settings.files_dir, "code", program_relative)
|
63
61
|
self.saved_program = program_relative # type: ignore
|
@@ -121,8 +119,8 @@ class SystemInfo:
|
|
121
119
|
ValueError,
|
122
120
|
subprocess.CalledProcessError,
|
123
121
|
subprocess.TimeoutExpired,
|
124
|
-
)
|
125
|
-
logger.
|
122
|
+
):
|
123
|
+
logger.exception("Error generating diff.")
|
126
124
|
logger.debug("Saving git patches done")
|
127
125
|
|
128
126
|
def _probe_git(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
@@ -220,8 +218,8 @@ class SystemInfo:
|
|
220
218
|
stderr=subprocess.DEVNULL,
|
221
219
|
timeout=15, # add timeout since conda env export could take a really long time
|
222
220
|
)
|
223
|
-
except Exception
|
224
|
-
logger.exception(
|
221
|
+
except Exception:
|
222
|
+
logger.exception("Error saving conda packages")
|
225
223
|
logger.debug("Saving conda packages done")
|
226
224
|
|
227
225
|
def publish(self, system_info: dict) -> None:
|
@@ -177,8 +177,8 @@ class SystemMonitor:
|
|
177
177
|
# publish telemetry
|
178
178
|
self.publish_telemetry()
|
179
179
|
self.aggregate_and_publish_asset_metrics()
|
180
|
-
except Exception
|
181
|
-
logger.
|
180
|
+
except Exception:
|
181
|
+
logger.exception("Error publishing last batch of metrics.")
|
182
182
|
|
183
183
|
def start(self) -> None:
|
184
184
|
self._shutdown_event.clear()
|
@@ -199,8 +199,8 @@ class SystemMonitor:
|
|
199
199
|
asset.finish()
|
200
200
|
try:
|
201
201
|
self._process.join()
|
202
|
-
except Exception
|
203
|
-
logger.
|
202
|
+
except Exception:
|
203
|
+
logger.exception("Error joining system monitor process.")
|
204
204
|
self._process = None
|
205
205
|
|
206
206
|
def probe(self, publish: bool = True) -> dict:
|
wandb/sdk/internal/tb_watcher.py
CHANGED
@@ -288,9 +288,9 @@ class TBDirWatcher:
|
|
288
288
|
def _thread_except_body(self) -> None:
|
289
289
|
try:
|
290
290
|
self._thread_body()
|
291
|
-
except Exception
|
291
|
+
except Exception:
|
292
292
|
logger.exception("generic exception in TBDirWatcher thread")
|
293
|
-
raise
|
293
|
+
raise
|
294
294
|
|
295
295
|
def _thread_body(self) -> None:
|
296
296
|
"""Check for new events every second."""
|
@@ -394,9 +394,9 @@ class TBEventConsumer:
|
|
394
394
|
def _thread_except_body(self) -> None:
|
395
395
|
try:
|
396
396
|
self._thread_body()
|
397
|
-
except Exception
|
397
|
+
except Exception:
|
398
398
|
logger.exception("generic exception in TBEventConsumer thread")
|
399
|
-
raise
|
399
|
+
raise
|
400
400
|
|
401
401
|
def _thread_body(self) -> None:
|
402
402
|
while True:
|
@@ -490,9 +490,7 @@ class TBHistory:
|
|
490
490
|
dropped_keys.append(k)
|
491
491
|
del self._data[k]
|
492
492
|
wandb.termwarn(
|
493
|
-
"Step {} exceeds max data limit, dropping {} of the largest keys:"
|
494
|
-
self._step, len(dropped_keys)
|
495
|
-
)
|
493
|
+
f"Step {self._step} exceeds max data limit, dropping {len(dropped_keys)} of the largest keys:"
|
496
494
|
)
|
497
495
|
print("\t" + ("\n\t".join(dropped_keys))) # noqa: T201
|
498
496
|
self._data["_step"] = self._step
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -55,7 +55,7 @@ def set_launch_logfile(logfile: str) -> None:
|
|
55
55
|
_launch_logger.log(logging.INFO, "Internal agent logs printing to %s", logfile)
|
56
56
|
|
57
57
|
|
58
|
-
def resolve_agent_config(
|
58
|
+
def resolve_agent_config(
|
59
59
|
entity: Optional[str],
|
60
60
|
max_jobs: Optional[int],
|
61
61
|
queues: Optional[Tuple[str]],
|
@@ -7,13 +7,12 @@ import enum
|
|
7
7
|
import json
|
8
8
|
import logging
|
9
9
|
import os
|
10
|
+
import shlex
|
10
11
|
import shutil
|
11
12
|
import tempfile
|
12
13
|
from copy import deepcopy
|
13
14
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
|
14
15
|
|
15
|
-
from six.moves import shlex_quote
|
16
|
-
|
17
16
|
import wandb
|
18
17
|
from wandb.apis.internal import Api
|
19
18
|
from wandb.errors import CommError
|
@@ -121,7 +120,7 @@ class LaunchProject:
|
|
121
120
|
).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
|
122
121
|
self.docker_image: Optional[str] = docker_config.get(
|
123
122
|
"docker_image"
|
124
|
-
) or launch_spec.get("image_uri")
|
123
|
+
) or launch_spec.get("image_uri") # type: ignore [assignment]
|
125
124
|
self.docker_user_id = docker_config.get("user_id", 1000)
|
126
125
|
self._entry_point: Optional[EntryPoint] = (
|
127
126
|
None # todo: keep multiple entrypoint support?
|
@@ -215,7 +214,7 @@ class LaunchProject:
|
|
215
214
|
launch_spec.get("docker", {}),
|
216
215
|
launch_spec.get("git", {}),
|
217
216
|
launch_spec.get("overrides", {}),
|
218
|
-
launch_spec.get("resource", None),
|
217
|
+
launch_spec.get("resource", None), # type: ignore [arg-type]
|
219
218
|
launch_spec.get("resource_args", {}),
|
220
219
|
launch_spec.get("run_id", None),
|
221
220
|
launch_spec.get("sweep_id", {}),
|
@@ -487,30 +486,30 @@ class LaunchProject:
|
|
487
486
|
return env_vars
|
488
487
|
|
489
488
|
def parse_existing_requirements(self) -> str:
|
490
|
-
import
|
489
|
+
from packaging.requirements import InvalidRequirement, Requirement
|
491
490
|
|
492
491
|
requirements_line = ""
|
493
492
|
assert self.project_dir is not None
|
494
493
|
base_requirements = os.path.join(self.project_dir, "requirements.txt")
|
495
494
|
if os.path.exists(base_requirements):
|
496
495
|
include_only = set()
|
497
|
-
with open(base_requirements) as
|
498
|
-
|
499
|
-
|
496
|
+
with open(base_requirements) as f2:
|
497
|
+
for line in f2:
|
498
|
+
if line.strip() == "":
|
499
|
+
continue
|
500
|
+
|
500
501
|
try:
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
# just catch them all and ignore packages we can't parse
|
511
|
-
except Exception as e:
|
512
|
-
_logger.warn(f"Unable to parse requirements.txt: {e}")
|
502
|
+
req = Requirement(line)
|
503
|
+
name = req.name.lower()
|
504
|
+
include_only.add(shlex.quote(name))
|
505
|
+
except InvalidRequirement:
|
506
|
+
_logger.warning(
|
507
|
+
"Unable to parse line %s in requirements.txt",
|
508
|
+
line,
|
509
|
+
exc_info=True,
|
510
|
+
)
|
513
511
|
continue
|
512
|
+
|
514
513
|
requirements_line += "WANDB_ONLY_INCLUDE={} ".format(",".join(include_only))
|
515
514
|
if "wandb" not in requirements_line:
|
516
515
|
wandb.termwarn(f"{LOG_PREFIX}wandb is not present in requirements.txt.")
|
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -123,7 +123,7 @@ class InternalAgentLogger:
|
|
123
123
|
def warn(self, message: str):
|
124
124
|
if self._print_to_terminal:
|
125
125
|
wandb.termwarn(f"{LOG_PREFIX}{message}")
|
126
|
-
_logger.
|
126
|
+
_logger.warning(f"{LOG_PREFIX}{message}")
|
127
127
|
|
128
128
|
def info(self, message: str):
|
129
129
|
if self._print_to_terminal:
|
@@ -589,7 +589,7 @@ class LaunchAgent:
|
|
589
589
|
)
|
590
590
|
if agent_response["stopPolling"]:
|
591
591
|
# shutdown process and all jobs if requested from ui
|
592
|
-
raise KeyboardInterrupt
|
592
|
+
raise KeyboardInterrupt # noqa: TRY301
|
593
593
|
if self.num_running_jobs < self._max_jobs:
|
594
594
|
# only check for new jobs if we're not at max
|
595
595
|
job_and_queue = await self.get_job_and_queue()
|
@@ -850,7 +850,7 @@ class LaunchAgent:
|
|
850
850
|
)
|
851
851
|
return True
|
852
852
|
wandb.termlog(
|
853
|
-
f"{LOG_PREFIX}Run {job_tracker.run_id} was preempted,
|
853
|
+
f"{LOG_PREFIX}Run {job_tracker.run_id} was preempted, requeuing..."
|
854
854
|
)
|
855
855
|
|
856
856
|
if "sweep_id" in config:
|
wandb/sdk/launch/agent/config.py
CHANGED
@@ -48,6 +48,6 @@ class JobAndRunStatusTracker:
|
|
48
48
|
check_stop = event_loop_thread_exec(api.api.check_stop_requested)
|
49
49
|
try:
|
50
50
|
return bool(await check_stop(self.project, self.entity, self.run_id))
|
51
|
-
except CommError
|
52
|
-
_logger.
|
51
|
+
except CommError:
|
52
|
+
_logger.exception("CommError when checking if wandb run stopped")
|
53
53
|
return False
|
@@ -4,10 +4,9 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import pathlib
|
6
6
|
import shlex
|
7
|
+
import shutil
|
7
8
|
from typing import Any, Dict, List, Tuple
|
8
9
|
|
9
|
-
from dockerpycreds.utils import find_executable # type: ignore
|
10
|
-
|
11
10
|
import wandb
|
12
11
|
import wandb.env
|
13
12
|
from wandb import docker
|
@@ -38,7 +37,7 @@ _WANDB_DOCKERFILE_NAME = "Dockerfile.wandb"
|
|
38
37
|
|
39
38
|
async def validate_docker_installation() -> None:
|
40
39
|
"""Verify if Docker is installed on host machine."""
|
41
|
-
find_exec = event_loop_thread_exec(
|
40
|
+
find_exec = event_loop_thread_exec(shutil.which)
|
42
41
|
if not await find_exec("docker"):
|
43
42
|
raise ExecutionError(
|
44
43
|
"Could not find Docker executable. "
|
@@ -336,14 +336,15 @@ class KanikoBuilder(AbstractBuilder):
|
|
336
336
|
pod_name = get_pod_name_safe(k8s_job)
|
337
337
|
if pod_name:
|
338
338
|
msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
|
339
|
-
raise Exception(msg)
|
339
|
+
raise Exception(msg) # noqa: TRY301
|
340
340
|
try:
|
341
341
|
pods_from_job = await core_v1.list_namespaced_pod(
|
342
342
|
namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
|
343
343
|
)
|
344
344
|
if len(pods_from_job.items) != 1:
|
345
|
-
raise Exception(
|
346
|
-
f"Expected 1 pod for job {build_job_name},
|
345
|
+
raise Exception( # noqa: TRY301
|
346
|
+
f"Expected 1 pod for job {build_job_name},"
|
347
|
+
f" found {len(pods_from_job.items)}"
|
347
348
|
)
|
348
349
|
pod_name = pods_from_job.items[0].metadata.name
|
349
350
|
logs = await core_v1.read_namespaced_pod_log(pod_name, NAMESPACE)
|
@@ -358,7 +359,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
358
359
|
wandb.termerror(
|
359
360
|
f"{LOG_PREFIX}Exception when creating Kubernetes resources: {e}\n"
|
360
361
|
)
|
361
|
-
raise
|
362
|
+
raise
|
362
363
|
finally:
|
363
364
|
wandb.termlog(f"{LOG_PREFIX}Cleaning up resources")
|
364
365
|
try:
|
@@ -94,8 +94,7 @@ class GcpEnvironment(AbstractEnvironment):
|
|
94
94
|
region = config.get("region", None)
|
95
95
|
if not region:
|
96
96
|
raise LaunchError(
|
97
|
-
"Could not create GcpEnvironment from config. Missing 'region' "
|
98
|
-
"field."
|
97
|
+
"Could not create GcpEnvironment from config. Missing 'region' field."
|
99
98
|
)
|
100
99
|
return cls(region=region)
|
101
100
|
|
@@ -15,12 +15,12 @@ if TYPE_CHECKING:
|
|
15
15
|
from azure.core.exceptions import ResourceNotFoundError # type: ignore
|
16
16
|
|
17
17
|
|
18
|
-
ContainerRegistryClient = get_module(
|
18
|
+
ContainerRegistryClient = get_module(
|
19
19
|
"azure.containerregistry",
|
20
20
|
required="The azure-containerregistry package is required to use launch with Azure. Please install it with `pip install azure-containerregistry`.",
|
21
21
|
).ContainerRegistryClient
|
22
22
|
|
23
|
-
ResourceNotFoundError = get_module(
|
23
|
+
ResourceNotFoundError = get_module(
|
24
24
|
"azure.core.exceptions",
|
25
25
|
required="The azure-core package is required to use launch with Azure. Please install it with `pip install azure-core`.",
|
26
26
|
).ResourceNotFoundError
|
@@ -14,11 +14,11 @@ from wandb.util import get_module
|
|
14
14
|
|
15
15
|
_logger = logging.getLogger(__name__)
|
16
16
|
|
17
|
-
botocore = get_module(
|
17
|
+
botocore = get_module(
|
18
18
|
"botocore",
|
19
19
|
required="The boto3 package is required to use launch with AWS. Please install it with `pip install wandb[launch]`.",
|
20
20
|
)
|
21
|
-
boto3 = get_module(
|
21
|
+
boto3 = get_module(
|
22
22
|
"boto3",
|
23
23
|
required="The boto3 package is required to use launch with AWS. Please install it with `pip install wandb[launch]`.",
|
24
24
|
)
|
@@ -17,16 +17,16 @@ from .abstract import AbstractRegistry
|
|
17
17
|
|
18
18
|
_logger = logging.getLogger(__name__)
|
19
19
|
|
20
|
-
google = get_module(
|
20
|
+
google = get_module(
|
21
21
|
"google",
|
22
22
|
required="The google package is required to use launch with Google. Please install it with `pip install wandb[launch]`.",
|
23
23
|
)
|
24
|
-
google.auth = get_module(
|
24
|
+
google.auth = get_module(
|
25
25
|
"google.auth",
|
26
26
|
required="The google-auth package is required to use launch with Google. Please install it with `pip install wandb[launch]`.",
|
27
27
|
)
|
28
28
|
|
29
|
-
google.cloud.artifactregistry = get_module(
|
29
|
+
google.cloud.artifactregistry = get_module(
|
30
30
|
"google.cloud.artifactregistry",
|
31
31
|
required="The google-cloud-artifactregistry package is required to use launch with Google. Please install it with `pip install wandb[launch]`.",
|
32
32
|
)
|
@@ -6,13 +6,12 @@ of runs launched in different environments (e.g. runs launched locally or in a c
|
|
6
6
|
|
7
7
|
import logging
|
8
8
|
import os
|
9
|
+
import shutil
|
9
10
|
import subprocess
|
10
11
|
import sys
|
11
12
|
from abc import ABC, abstractmethod
|
12
13
|
from typing import Any, Dict, List, Literal, Optional, Union
|
13
14
|
|
14
|
-
from dockerpycreds.utils import find_executable # type: ignore
|
15
|
-
|
16
15
|
import wandb
|
17
16
|
from wandb.apis.internal import Api
|
18
17
|
from wandb.sdk.lib import runid
|
@@ -145,10 +144,11 @@ class AbstractRunner(ABC):
|
|
145
144
|
self._namespace = runid.generate_id()
|
146
145
|
|
147
146
|
def find_executable(
|
148
|
-
self,
|
149
|
-
|
147
|
+
self,
|
148
|
+
cmd: str,
|
149
|
+
) -> Union[str, None]:
|
150
150
|
"""Cross platform utility for checking if a program is available."""
|
151
|
-
return
|
151
|
+
return shutil.which(cmd)
|
152
152
|
|
153
153
|
@property
|
154
154
|
def api_key(self) -> Any:
|
@@ -5,10 +5,10 @@ import logging
|
|
5
5
|
import traceback
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
7
7
|
|
8
|
-
import kubernetes_asyncio # type: ignore
|
8
|
+
import kubernetes_asyncio # type: ignore
|
9
9
|
import urllib3
|
10
10
|
from kubernetes_asyncio import watch
|
11
|
-
from kubernetes_asyncio.client import ( # type: ignore
|
11
|
+
from kubernetes_asyncio.client import ( # type: ignore
|
12
12
|
ApiException,
|
13
13
|
BatchV1Api,
|
14
14
|
CoreV1Api,
|
@@ -491,7 +491,7 @@ class KubernetesRunner(AbstractRunner):
|
|
491
491
|
|
492
492
|
async def run(
|
493
493
|
self, launch_project: LaunchProject, image_uri: str
|
494
|
-
) -> Optional[AbstractRun]:
|
494
|
+
) -> Optional[AbstractRun]:
|
495
495
|
"""Execute a launch project on Kubernetes.
|
496
496
|
|
497
497
|
Arguments:
|
@@ -69,7 +69,7 @@ class SagemakerSubmittedRun(AbstractRun):
|
|
69
69
|
)
|
70
70
|
assert "events" in res
|
71
71
|
return "\n".join(
|
72
|
-
[f
|
72
|
+
[f"{event['timestamp']}:{event['message']}" for event in res["events"]]
|
73
73
|
)
|
74
74
|
except self.log_client.exceptions.ResourceNotFoundException:
|
75
75
|
wandb.termwarn(
|
@@ -398,9 +398,7 @@ async def launch_sagemaker_job(
|
|
398
398
|
wandb.termlog(
|
399
399
|
f"{LOG_PREFIX}Run job submitted with arn: {resp.get('TrainingJobArn')}"
|
400
400
|
)
|
401
|
-
url = "https://{
|
402
|
-
region=sagemaker_client.meta.region_name, job_name=training_job_name
|
403
|
-
)
|
401
|
+
url = f"https://{sagemaker_client.meta.region_name}.console.aws.amazon.com/sagemaker/home?region={sagemaker_client.meta.region_name}#/jobs/{training_job_name}"
|
404
402
|
wandb.termlog(f"{LOG_PREFIX}See training job status at: {url}")
|
405
403
|
return run
|
406
404
|
|
@@ -49,12 +49,7 @@ class VertexSubmittedRun(AbstractRun):
|
|
49
49
|
return self._job.project # type: ignore
|
50
50
|
|
51
51
|
def get_page_link(self) -> str:
|
52
|
-
return "{
|
53
|
-
console_uri=GCP_CONSOLE_URI,
|
54
|
-
region=self.gcp_region,
|
55
|
-
job_id=self.id,
|
56
|
-
project=self.gcp_project,
|
57
|
-
)
|
52
|
+
return f"{GCP_CONSOLE_URI}/vertex-ai/locations/{self.gcp_region}/training/{self.id}?project={self.gcp_project}"
|
58
53
|
|
59
54
|
async def wait(self) -> bool:
|
60
55
|
# TODO: run this in a separate thread.
|
@@ -177,7 +172,7 @@ async def launch_vertex_job(
|
|
177
172
|
) -> VertexSubmittedRun:
|
178
173
|
try:
|
179
174
|
await environment.verify()
|
180
|
-
aiplatform = get_module(
|
175
|
+
aiplatform = get_module(
|
181
176
|
"google.cloud.aiplatform",
|
182
177
|
"VertexRunner requires google.cloud.aiplatform to be installed",
|
183
178
|
)
|
@@ -27,7 +27,7 @@ def load_scheduler(scheduler_type: str) -> Any:
|
|
27
27
|
f"{list(_WANDB_SCHEDULERS.keys())}, got: {scheduler_type}"
|
28
28
|
)
|
29
29
|
|
30
|
-
log.
|
30
|
+
log.warning(f"Loading dependencies for Scheduler of type: {scheduler_type}")
|
31
31
|
import_func = _WANDB_SCHEDULERS[scheduler_type]
|
32
32
|
return import_func()
|
33
33
|
|
@@ -354,7 +354,7 @@ class Scheduler(ABC):
|
|
354
354
|
wandb.termlog(f"{LOG_PREFIX}Scheduler failed with exception {e}")
|
355
355
|
self.state = SchedulerState.FAILED
|
356
356
|
self.exit()
|
357
|
-
raise
|
357
|
+
raise
|
358
358
|
else:
|
359
359
|
# scheduler succeeds if at runcap
|
360
360
|
if self.state == SchedulerState.FLUSH_RUNS and self.at_runcap:
|
@@ -699,7 +699,7 @@ class Scheduler(ABC):
|
|
699
699
|
if entry_point:
|
700
700
|
wandb.termwarn(
|
701
701
|
f"{LOG_PREFIX}Sweep command {entry_point} will override"
|
702
|
-
f
|
702
|
+
f" {'job' if _job else 'image_uri'} entrypoint"
|
703
703
|
)
|
704
704
|
|
705
705
|
# override resource and args of job
|
wandb/sdk/launch/sweeps/utils.py
CHANGED
@@ -199,7 +199,7 @@ def create_sweep_command(command: Optional[List] = None) -> List:
|
|
199
199
|
for m in matches[::-1]:
|
200
200
|
# Default to just leaving as is if environment variable does not exist
|
201
201
|
_var: str = os.environ.get(m.group(1), m.group(1))
|
202
|
-
command[i] = f"{command[i][:m.start()]}{_var}{command[i][m.end():]}"
|
202
|
+
command[i] = f"{command[i][: m.start()]}{_var}{command[i][m.end() :]}"
|
203
203
|
return command
|
204
204
|
|
205
205
|
|
@@ -211,7 +211,7 @@ def create_sweep_command_args(command: Dict) -> Dict[str, Any]:
|
|
211
211
|
|
212
212
|
"""
|
213
213
|
if "args" not in command:
|
214
|
-
raise ValueError('No "args" found in command: {}'
|
214
|
+
raise ValueError(f'No "args" found in command: {command}')
|
215
215
|
# four different formats of command args
|
216
216
|
# (1) standard command line flags (e.g. --foo=bar)
|
217
217
|
flags: List[str] = []
|
@@ -232,7 +232,7 @@ def create_sweep_command_args(command: Dict) -> Dict[str, Any]:
|
|
232
232
|
try:
|
233
233
|
_value: Any = config["value"]
|
234
234
|
except KeyError:
|
235
|
-
raise ValueError('No "value" found for command["args"]["{}"]'
|
235
|
+
raise ValueError(f'No "value" found for command["args"]["{param}"]')
|
236
236
|
|
237
237
|
_flag: str = f"{param}={_value}"
|
238
238
|
flags.append("--" + _flag)
|
wandb/sdk/launch/utils.py
CHANGED
@@ -323,8 +323,7 @@ def validate_launch_spec_source(launch_spec: Dict[str, Any]) -> None:
|
|
323
323
|
docker_image = launch_spec.get("docker", {}).get("docker_image")
|
324
324
|
if bool(job) == bool(docker_image):
|
325
325
|
raise LaunchError(
|
326
|
-
"Exactly one of job or docker_image must be specified in the launch "
|
327
|
-
"spec."
|
326
|
+
"Exactly one of job or docker_image must be specified in the launch spec."
|
328
327
|
)
|
329
328
|
|
330
329
|
|
@@ -628,9 +627,9 @@ def docker_image_exists(docker_image: str, should_raise: bool = False) -> bool:
|
|
628
627
|
try:
|
629
628
|
docker.run(["docker", "image", "inspect", docker_image])
|
630
629
|
return True
|
631
|
-
except (docker.DockerError, ValueError)
|
630
|
+
except (docker.DockerError, ValueError):
|
632
631
|
if should_raise:
|
633
|
-
raise
|
632
|
+
raise
|
634
633
|
_logger.info("Base image not found. Generating new base image")
|
635
634
|
return False
|
636
635
|
|
wandb/sdk/lib/apikey.py
CHANGED
@@ -21,6 +21,7 @@ import wandb
|
|
21
21
|
from wandb.apis import InternalApi
|
22
22
|
from wandb.errors import term
|
23
23
|
from wandb.errors.links import url_registry
|
24
|
+
from wandb.sdk import wandb_setup
|
24
25
|
from wandb.util import _is_databricks, isatty, prompt_choices
|
25
26
|
|
26
27
|
if TYPE_CHECKING:
|
@@ -237,9 +238,7 @@ def write_netrc(host: str, entity: str, key: str):
|
|
237
238
|
_, key_suffix = key.split("-", 1) if "-" in key else ("", key)
|
238
239
|
if len(key_suffix) != 40:
|
239
240
|
raise ValueError(
|
240
|
-
"API-key must be exactly 40 characters long: {} ({} chars)"
|
241
|
-
key_suffix, len(key_suffix)
|
242
|
-
)
|
241
|
+
f"API-key must be exactly 40 characters long: {key_suffix} ({len(key_suffix)} chars)"
|
243
242
|
)
|
244
243
|
|
245
244
|
normalized_host = urlparse(host).netloc
|
@@ -276,7 +275,7 @@ def write_netrc(host: str, entity: str, key: str):
|
|
276
275
|
elif skip:
|
277
276
|
skip -= 1
|
278
277
|
else:
|
279
|
-
f.write("{}\n"
|
278
|
+
f.write(f"{line}\n")
|
280
279
|
|
281
280
|
wandb.termlog(
|
282
281
|
f"Appending key for {normalized_host} to your netrc file: {netrc_path}"
|
@@ -311,16 +310,14 @@ def write_key(
|
|
311
310
|
_, suffix = key.split("-", 1) if "-" in key else ("", key)
|
312
311
|
|
313
312
|
if len(suffix) != 40:
|
314
|
-
raise ValueError(
|
315
|
-
"API key must be 40 characters long, yours was {}".format(len(key))
|
316
|
-
)
|
313
|
+
raise ValueError(f"API key must be 40 characters long, yours was {len(key)}")
|
317
314
|
|
318
315
|
write_netrc(settings.base_url, "user", key)
|
319
316
|
|
320
317
|
|
321
318
|
def api_key(settings: Settings | None = None) -> str | None:
|
322
319
|
if settings is None:
|
323
|
-
settings =
|
320
|
+
settings = wandb_setup.singleton().settings
|
324
321
|
if settings.api_key:
|
325
322
|
return settings.api_key
|
326
323
|
|
wandb/sdk/lib/config_util.py
CHANGED
@@ -66,13 +66,13 @@ def dict_from_config_file(
|
|
66
66
|
) -> Optional[Dict[str, Any]]:
|
67
67
|
if not os.path.exists(filename):
|
68
68
|
if must_exist:
|
69
|
-
raise ConfigError("config file {} doesn't exist"
|
70
|
-
logger.debug("no default config file found in {}"
|
69
|
+
raise ConfigError(f"config file {filename} doesn't exist")
|
70
|
+
logger.debug(f"no default config file found in {filename}")
|
71
71
|
return None
|
72
72
|
try:
|
73
73
|
conf_file = open(filename)
|
74
74
|
except OSError:
|
75
|
-
raise ConfigError("Couldn't read config file: {}"
|
75
|
+
raise ConfigError(f"Couldn't read config file: {filename}")
|
76
76
|
try:
|
77
77
|
loaded = load_yaml(conf_file)
|
78
78
|
except yaml.parser.ParserError:
|
wandb/sdk/lib/fsm.py
CHANGED
@@ -28,26 +28,11 @@ Usage:
|
|
28
28
|
```
|
29
29
|
"""
|
30
30
|
|
31
|
-
import sys
|
32
31
|
from abc import abstractmethod
|
33
32
|
from dataclasses import dataclass
|
34
|
-
from typing import
|
35
|
-
|
36
|
-
|
37
|
-
Generic,
|
38
|
-
Optional,
|
39
|
-
Protocol,
|
40
|
-
Sequence,
|
41
|
-
Type,
|
42
|
-
TypeVar,
|
43
|
-
Union,
|
44
|
-
runtime_checkable,
|
45
|
-
)
|
46
|
-
|
47
|
-
if sys.version_info >= (3, 10):
|
48
|
-
from typing import TypeAlias
|
49
|
-
else:
|
50
|
-
from typing_extensions import TypeAlias
|
33
|
+
from typing import Callable, Dict, Generic, Optional, Sequence, Type, Union
|
34
|
+
|
35
|
+
from typing_extensions import Protocol, TypeAlias, TypeVar, runtime_checkable
|
51
36
|
|
52
37
|
T_FsmInputs = TypeVar("T_FsmInputs", contravariant=True)
|
53
38
|
T_FsmContext = TypeVar("T_FsmContext")
|