wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -3
- wandb/apis/__init__.py +1 -3
- wandb/apis/importers/__init__.py +4 -0
- wandb/apis/importers/base.py +312 -0
- wandb/apis/importers/mlflow.py +113 -0
- wandb/apis/internal.py +29 -2
- wandb/apis/normalize.py +6 -5
- wandb/apis/public.py +163 -180
- wandb/apis/reports/_templates.py +6 -12
- wandb/apis/reports/report.py +1 -1
- wandb/apis/reports/runset.py +1 -3
- wandb/apis/reports/util.py +12 -10
- wandb/beta/workflows.py +57 -34
- wandb/catboost/__init__.py +1 -2
- wandb/cli/cli.py +215 -133
- wandb/data_types.py +63 -56
- wandb/docker/__init__.py +78 -16
- wandb/docker/auth.py +21 -22
- wandb/env.py +0 -1
- wandb/errors/__init__.py +8 -116
- wandb/errors/term.py +1 -1
- wandb/fastai/__init__.py +1 -2
- wandb/filesync/dir_watcher.py +8 -5
- wandb/filesync/step_prepare.py +76 -75
- wandb/filesync/step_upload.py +1 -2
- wandb/integration/catboost/__init__.py +1 -3
- wandb/integration/catboost/catboost.py +8 -14
- wandb/integration/fastai/__init__.py +7 -13
- wandb/integration/gym/__init__.py +35 -4
- wandb/integration/keras/__init__.py +3 -3
- wandb/integration/keras/callbacks/metrics_logger.py +9 -8
- wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
- wandb/integration/keras/callbacks/tables_builder.py +31 -19
- wandb/integration/kfp/kfp_patch.py +20 -17
- wandb/integration/kfp/wandb_logging.py +1 -2
- wandb/integration/lightgbm/__init__.py +21 -19
- wandb/integration/prodigy/prodigy.py +6 -7
- wandb/integration/sacred/__init__.py +9 -12
- wandb/integration/sagemaker/__init__.py +1 -3
- wandb/integration/sagemaker/auth.py +0 -1
- wandb/integration/sagemaker/config.py +1 -1
- wandb/integration/sagemaker/resources.py +1 -1
- wandb/integration/sb3/sb3.py +8 -4
- wandb/integration/tensorboard/__init__.py +1 -3
- wandb/integration/tensorboard/log.py +8 -8
- wandb/integration/tensorboard/monkeypatch.py +11 -9
- wandb/integration/tensorflow/__init__.py +1 -3
- wandb/integration/xgboost/__init__.py +4 -6
- wandb/integration/yolov8/__init__.py +7 -0
- wandb/integration/yolov8/yolov8.py +250 -0
- wandb/jupyter.py +31 -35
- wandb/lightgbm/__init__.py +1 -2
- wandb/old/settings.py +2 -2
- wandb/plot/bar.py +1 -2
- wandb/plot/confusion_matrix.py +1 -3
- wandb/plot/histogram.py +1 -2
- wandb/plot/line.py +1 -2
- wandb/plot/line_series.py +4 -4
- wandb/plot/pr_curve.py +17 -20
- wandb/plot/roc_curve.py +1 -3
- wandb/plot/scatter.py +1 -2
- wandb/proto/v3/wandb_server_pb2.py +85 -39
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_server_pb2.py +51 -39
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/__init__.py +1 -3
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/data_types/_dtypes.py +38 -30
- wandb/sdk/data_types/base_types/json_metadata.py +1 -3
- wandb/sdk/data_types/base_types/media.py +17 -17
- wandb/sdk/data_types/base_types/wb_value.py +33 -26
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
- wandb/sdk/data_types/helper_types/classes.py +1 -1
- wandb/sdk/data_types/helper_types/image_mask.py +12 -12
- wandb/sdk/data_types/histogram.py +5 -4
- wandb/sdk/data_types/html.py +1 -2
- wandb/sdk/data_types/image.py +11 -11
- wandb/sdk/data_types/molecule.py +3 -6
- wandb/sdk/data_types/object_3d.py +1 -2
- wandb/sdk/data_types/plotly.py +1 -2
- wandb/sdk/data_types/saved_model.py +10 -8
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/data_logging.py +5 -5
- wandb/sdk/interface/artifacts.py +288 -266
- wandb/sdk/interface/interface.py +2 -3
- wandb/sdk/interface/interface_grpc.py +1 -1
- wandb/sdk/interface/interface_queue.py +1 -1
- wandb/sdk/interface/interface_relay.py +1 -1
- wandb/sdk/interface/interface_shared.py +1 -2
- wandb/sdk/interface/interface_sock.py +1 -1
- wandb/sdk/interface/message_future.py +1 -1
- wandb/sdk/interface/message_future_poll.py +1 -1
- wandb/sdk/interface/router.py +1 -1
- wandb/sdk/interface/router_queue.py +1 -1
- wandb/sdk/interface/router_relay.py +1 -1
- wandb/sdk/interface/router_sock.py +1 -1
- wandb/sdk/interface/summary_record.py +1 -1
- wandb/sdk/internal/artifacts.py +1 -1
- wandb/sdk/internal/datastore.py +2 -3
- wandb/sdk/internal/file_pusher.py +5 -3
- wandb/sdk/internal/file_stream.py +22 -19
- wandb/sdk/internal/handler.py +5 -4
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/internal_api.py +115 -55
- wandb/sdk/internal/job_builder.py +1 -3
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/progress.py +4 -6
- wandb/sdk/internal/sample.py +1 -3
- wandb/sdk/internal/sender.py +28 -16
- wandb/sdk/internal/settings_static.py +5 -5
- wandb/sdk/internal/system/assets/__init__.py +1 -0
- wandb/sdk/internal/system/assets/cpu.py +3 -9
- wandb/sdk/internal/system/assets/disk.py +2 -4
- wandb/sdk/internal/system/assets/gpu.py +6 -18
- wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
- wandb/sdk/internal/system/assets/interfaces.py +50 -22
- wandb/sdk/internal/system/assets/ipu.py +1 -3
- wandb/sdk/internal/system/assets/memory.py +7 -13
- wandb/sdk/internal/system/assets/network.py +4 -8
- wandb/sdk/internal/system/assets/open_metrics.py +283 -0
- wandb/sdk/internal/system/assets/tpu.py +1 -4
- wandb/sdk/internal/system/assets/trainium.py +26 -14
- wandb/sdk/internal/system/system_info.py +2 -3
- wandb/sdk/internal/system/system_monitor.py +52 -20
- wandb/sdk/internal/tb_watcher.py +12 -13
- wandb/sdk/launch/_project_spec.py +54 -65
- wandb/sdk/launch/agent/agent.py +374 -90
- wandb/sdk/launch/builder/abstract.py +61 -7
- wandb/sdk/launch/builder/build.py +81 -110
- wandb/sdk/launch/builder/docker_builder.py +181 -0
- wandb/sdk/launch/builder/kaniko_builder.py +419 -0
- wandb/sdk/launch/builder/noop.py +31 -12
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
- wandb/sdk/launch/environment/abstract.py +28 -0
- wandb/sdk/launch/environment/aws_environment.py +276 -0
- wandb/sdk/launch/environment/gcp_environment.py +271 -0
- wandb/sdk/launch/environment/local_environment.py +65 -0
- wandb/sdk/launch/github_reference.py +3 -8
- wandb/sdk/launch/launch.py +38 -29
- wandb/sdk/launch/launch_add.py +6 -8
- wandb/sdk/launch/loader.py +230 -0
- wandb/sdk/launch/registry/abstract.py +54 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
- wandb/sdk/launch/registry/local_registry.py +62 -0
- wandb/sdk/launch/runner/abstract.py +1 -16
- wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
- wandb/sdk/launch/runner/local_container.py +46 -22
- wandb/sdk/launch/runner/local_process.py +1 -4
- wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
- wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
- wandb/sdk/launch/sweeps/__init__.py +3 -2
- wandb/sdk/launch/sweeps/scheduler.py +132 -39
- wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
- wandb/sdk/launch/utils.py +101 -30
- wandb/sdk/launch/wandb_reference.py +2 -7
- wandb/sdk/lib/_settings_toposort_generate.py +166 -0
- wandb/sdk/lib/_settings_toposort_generated.py +201 -0
- wandb/sdk/lib/apikey.py +2 -4
- wandb/sdk/lib/config_util.py +4 -1
- wandb/sdk/lib/console.py +1 -3
- wandb/sdk/lib/deprecate.py +3 -3
- wandb/sdk/lib/file_stream_utils.py +7 -5
- wandb/sdk/lib/filenames.py +1 -1
- wandb/sdk/lib/filesystem.py +61 -5
- wandb/sdk/lib/git.py +1 -3
- wandb/sdk/lib/import_hooks.py +4 -7
- wandb/sdk/lib/ipython.py +8 -5
- wandb/sdk/lib/lazyloader.py +1 -3
- wandb/sdk/lib/mailbox.py +14 -4
- wandb/sdk/lib/proto_util.py +10 -5
- wandb/sdk/lib/redirect.py +15 -22
- wandb/sdk/lib/reporting.py +1 -3
- wandb/sdk/lib/retry.py +4 -5
- wandb/sdk/lib/runid.py +1 -3
- wandb/sdk/lib/server.py +15 -9
- wandb/sdk/lib/sock_client.py +1 -1
- wandb/sdk/lib/sparkline.py +1 -1
- wandb/sdk/lib/wburls.py +1 -1
- wandb/sdk/service/port_file.py +1 -2
- wandb/sdk/service/service.py +36 -13
- wandb/sdk/service/service_base.py +12 -1
- wandb/sdk/verify/verify.py +5 -7
- wandb/sdk/wandb_artifacts.py +142 -177
- wandb/sdk/wandb_config.py +5 -8
- wandb/sdk/wandb_helper.py +1 -1
- wandb/sdk/wandb_init.py +24 -13
- wandb/sdk/wandb_login.py +9 -9
- wandb/sdk/wandb_manager.py +39 -4
- wandb/sdk/wandb_metric.py +2 -6
- wandb/sdk/wandb_require.py +4 -15
- wandb/sdk/wandb_require_helpers.py +1 -9
- wandb/sdk/wandb_run.py +95 -141
- wandb/sdk/wandb_save.py +1 -3
- wandb/sdk/wandb_settings.py +149 -54
- wandb/sdk/wandb_setup.py +66 -46
- wandb/sdk/wandb_summary.py +13 -10
- wandb/sdk/wandb_sweep.py +6 -7
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/calculate/confusion_matrix.py +1 -1
- wandb/sklearn/calculate/learning_curve.py +1 -1
- wandb/sklearn/calculate/summary_metrics.py +1 -3
- wandb/sklearn/plot/__init__.py +1 -1
- wandb/sklearn/plot/classifier.py +27 -18
- wandb/sklearn/plot/clusterer.py +4 -5
- wandb/sklearn/plot/regressor.py +4 -4
- wandb/sklearn/plot/shared.py +2 -2
- wandb/sync/__init__.py +1 -3
- wandb/sync/sync.py +4 -5
- wandb/testing/relay.py +11 -10
- wandb/trigger.py +1 -1
- wandb/util.py +106 -81
- wandb/viz.py +4 -4
- wandb/wandb_agent.py +50 -50
- wandb/wandb_controller.py +2 -3
- wandb/wandb_run.py +1 -2
- wandb/wandb_torch.py +1 -1
- wandb/xgboost/__init__.py +1 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
- wandb/sdk/launch/builder/docker.py +0 -80
- wandb/sdk/launch/builder/kaniko.py +0 -393
- wandb/sdk/launch/builder/loader.py +0 -32
- wandb/sdk/launch/runner/loader.py +0 -50
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -92,9 +92,7 @@ class _Stats:
|
|
92
92
|
|
93
93
|
|
94
94
|
class NeuronCoreStats:
|
95
|
-
"""
|
96
|
-
AWS Trainium stats.
|
97
|
-
"""
|
95
|
+
"""AWS Trainium stats."""
|
98
96
|
|
99
97
|
name: str = "trn.{key}"
|
100
98
|
samples: "Deque[_Stats]"
|
@@ -124,7 +122,7 @@ class NeuronCoreStats:
|
|
124
122
|
) as process:
|
125
123
|
while not self.shutdown_event.is_set():
|
126
124
|
if process.stdout is None:
|
127
|
-
|
125
|
+
self.shutdown_event.wait(0.1)
|
128
126
|
continue
|
129
127
|
|
130
128
|
raw_data = process.stdout.readline()
|
@@ -151,6 +149,15 @@ class NeuronCoreStats:
|
|
151
149
|
self.samples: "Deque[_Stats]" = deque()
|
152
150
|
self.shutdown_event = threading.Event()
|
153
151
|
|
152
|
+
self.neuron_monitor_thread: Optional[threading.Thread] = None
|
153
|
+
|
154
|
+
def setup(self) -> None:
|
155
|
+
"""Start the neuron-monitor thread for collecting raw data."""
|
156
|
+
if self.neuron_monitor_thread is not None:
|
157
|
+
return
|
158
|
+
|
159
|
+
logger.debug("Starting neuron-monitor thread")
|
160
|
+
self.shutdown_event.clear()
|
154
161
|
self.neuron_monitor_thread = threading.Thread(
|
155
162
|
name="NeuronCoreMntr",
|
156
163
|
target=self.neuron_monitor,
|
@@ -158,9 +165,20 @@ class NeuronCoreStats:
|
|
158
165
|
)
|
159
166
|
self.neuron_monitor_thread.start()
|
160
167
|
|
168
|
+
def teardown(self) -> None:
|
169
|
+
"""Stop the neuron-monitor thread."""
|
170
|
+
logger.debug("Stopping neuron-monitor thread")
|
171
|
+
try:
|
172
|
+
self.shutdown_event.set()
|
173
|
+
assert self.neuron_monitor_thread is not None
|
174
|
+
self.neuron_monitor_thread.join()
|
175
|
+
except Exception as e:
|
176
|
+
logger.error("neuron-monitor thread failed to stop: %s" % e)
|
177
|
+
finally:
|
178
|
+
self.neuron_monitor_thread = None
|
179
|
+
|
161
180
|
def _is_matching_entry(self, entry: dict) -> bool:
|
162
|
-
"""
|
163
|
-
For now, only check if the pid in the entry matches the pid of the process.
|
181
|
+
"""For now, only check if the pid in the entry matches the pid of the process.
|
164
182
|
|
165
183
|
todo: add matching by neuron_runtime_tag
|
166
184
|
"""
|
@@ -218,9 +236,7 @@ class NeuronCoreStats:
|
|
218
236
|
|
219
237
|
@staticmethod
|
220
238
|
def flatten_stats(sample: _Stats) -> dict:
|
221
|
-
"""
|
222
|
-
Flatten _Stats object into a flat dict of numbers.
|
223
|
-
"""
|
239
|
+
"""Flatten _Stats object into a flat dict of numbers."""
|
224
240
|
flattened = {}
|
225
241
|
|
226
242
|
def helper(key: str, value: Any) -> None:
|
@@ -302,6 +318,7 @@ class Trainium:
|
|
302
318
|
# on some systems that do not have the hardware
|
303
319
|
try:
|
304
320
|
# redirect stderr to null to avoid printing errors to the console
|
321
|
+
# todo: alternative: check /dev/neuron0 ? sysfs support coming soon in neuron tools
|
305
322
|
output = subprocess.check_output(
|
306
323
|
NEURON_LS_COMMAND,
|
307
324
|
universal_newlines=True,
|
@@ -319,11 +336,6 @@ class Trainium:
|
|
319
336
|
|
320
337
|
def finish(self) -> None:
|
321
338
|
self.metrics_monitor.finish()
|
322
|
-
# stop the raw data acquisition threads
|
323
|
-
for metric in self.metrics:
|
324
|
-
if hasattr(metric, "shutdown_event"):
|
325
|
-
logger.debug("Stopping neuron-monitor thread")
|
326
|
-
metric.shutdown_event.set()
|
327
339
|
|
328
340
|
def probe(self) -> dict:
|
329
341
|
try:
|
@@ -47,7 +47,7 @@ class SystemInfo:
|
|
47
47
|
|
48
48
|
# todo: refactor these _save_* methods
|
49
49
|
def _save_pip(self) -> None:
|
50
|
-
"""
|
50
|
+
"""Save the current working set of pip packages to {REQUIREMENTS_FNAME}."""
|
51
51
|
logger.debug(
|
52
52
|
"Saving list of pip packages installed into the current environment"
|
53
53
|
)
|
@@ -220,8 +220,7 @@ class SystemInfo:
|
|
220
220
|
if self.settings._jupyter_path.startswith("fileId="):
|
221
221
|
unescaped = unquote(self.settings._jupyter_path)
|
222
222
|
data["colab"] = (
|
223
|
-
"https://colab.research.google.com/notebook#"
|
224
|
-
+ unescaped # noqa
|
223
|
+
"https://colab.research.google.com/notebook#" + unescaped
|
225
224
|
)
|
226
225
|
data["program"] = self.settings._jupyter_name
|
227
226
|
else:
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
|
6
6
|
|
7
7
|
from .assets.asset_registry import asset_registry
|
8
8
|
from .assets.interfaces import Asset, Interface
|
9
|
+
from .assets.open_metrics import OpenMetrics
|
9
10
|
from .system_info import SystemInfo
|
10
11
|
|
11
12
|
if TYPE_CHECKING:
|
@@ -47,6 +48,8 @@ class SystemMonitor:
|
|
47
48
|
self._shutdown_event: mp.synchronize.Event = mp.Event()
|
48
49
|
self._process: Optional[Union[mp.Process, threading.Thread]] = None
|
49
50
|
|
51
|
+
self.settings = settings
|
52
|
+
|
50
53
|
# settings._stats_join_assets controls whether we should join stats from different assets
|
51
54
|
# before publishing them to the backend. If set to False, we will publish stats from each
|
52
55
|
# asset separately, using the backend interface. If set to True, we will aggregate stats from
|
@@ -59,14 +62,16 @@ class SystemMonitor:
|
|
59
62
|
sampling_interval: float = float(
|
60
63
|
max(
|
61
64
|
0.1,
|
62
|
-
settings._stats_sample_rate_seconds,
|
65
|
+
self.settings._stats_sample_rate_seconds,
|
63
66
|
)
|
64
67
|
) # seconds
|
65
68
|
# The number of samples to aggregate (e.g. average or compute max/min etc.)
|
66
69
|
# before publishing; defaults to 15; valid range: [1:30]
|
67
|
-
samples_to_aggregate: int = min(
|
70
|
+
samples_to_aggregate: int = min(
|
71
|
+
30, max(1, self.settings._stats_samples_to_average)
|
72
|
+
)
|
68
73
|
self.publishing_interval: float = sampling_interval * samples_to_aggregate
|
69
|
-
self.join_assets: bool = settings._stats_join_assets
|
74
|
+
self.join_assets: bool = self.settings._stats_join_assets
|
70
75
|
|
71
76
|
self.backend_interface = interface
|
72
77
|
self.asset_interface: Optional[AssetInterface] = (
|
@@ -74,21 +79,47 @@ class SystemMonitor:
|
|
74
79
|
)
|
75
80
|
|
76
81
|
# hardware assets
|
77
|
-
self.assets: List["Asset"] =
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
interface=self.asset_interface or self.backend_interface,
|
82
|
-
settings=settings,
|
83
|
-
shutdown_event=self._shutdown_event,
|
84
|
-
)
|
85
|
-
)
|
82
|
+
self.assets: List["Asset"] = self._get_assets()
|
83
|
+
|
84
|
+
# OpenMetrics/Prometheus-compatible endpoints
|
85
|
+
self.assets.extend(self._get_open_metrics_assets())
|
86
86
|
|
87
87
|
# static system info, both hardware and software
|
88
88
|
self.system_info: SystemInfo = SystemInfo(
|
89
|
-
settings=settings, interface=interface
|
89
|
+
settings=self.settings, interface=interface
|
90
90
|
)
|
91
91
|
|
92
|
+
def _get_assets(self) -> List["Asset"]:
|
93
|
+
return [
|
94
|
+
asset_class(
|
95
|
+
interface=self.asset_interface or self.backend_interface,
|
96
|
+
settings=self.settings,
|
97
|
+
shutdown_event=self._shutdown_event,
|
98
|
+
)
|
99
|
+
for asset_class in asset_registry
|
100
|
+
]
|
101
|
+
|
102
|
+
def _get_open_metrics_assets(self) -> List["Asset"]:
|
103
|
+
open_metrics_endpoints = self.settings._stats_open_metrics_endpoints
|
104
|
+
if not open_metrics_endpoints:
|
105
|
+
return []
|
106
|
+
|
107
|
+
assets: List[Asset] = []
|
108
|
+
for name, endpoint in open_metrics_endpoints.items():
|
109
|
+
if not OpenMetrics.is_available(url=endpoint):
|
110
|
+
continue
|
111
|
+
logger.debug(f"Monitoring OpenMetrics endpoint: {endpoint}")
|
112
|
+
open_metrics = OpenMetrics(
|
113
|
+
interface=self.asset_interface or self.backend_interface,
|
114
|
+
settings=self.settings,
|
115
|
+
shutdown_event=self._shutdown_event,
|
116
|
+
name=name,
|
117
|
+
url=endpoint,
|
118
|
+
)
|
119
|
+
assets.append(open_metrics) # type: ignore
|
120
|
+
|
121
|
+
return assets
|
122
|
+
|
92
123
|
def aggregate_and_publish_asset_metrics(self) -> None:
|
93
124
|
if self.asset_interface is None:
|
94
125
|
return None
|
@@ -147,13 +178,14 @@ class SystemMonitor:
|
|
147
178
|
|
148
179
|
def start(self) -> None:
|
149
180
|
self._shutdown_event.clear()
|
150
|
-
if self._process is None:
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
181
|
+
if self._process is not None:
|
182
|
+
return None
|
183
|
+
logger.info("Starting system monitor")
|
184
|
+
# self._process = mp.Process(target=self._start, name="SystemMonitor")
|
185
|
+
self._process = threading.Thread(
|
186
|
+
target=self._start, daemon=True, name="SystemMonitor"
|
187
|
+
)
|
188
|
+
self._process.start()
|
157
189
|
|
158
190
|
def finish(self) -> None:
|
159
191
|
if self._process is None:
|
wandb/sdk/internal/tb_watcher.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
"""
|
2
|
-
tensorboard watcher.
|
3
|
-
"""
|
1
|
+
"""tensorboard watcher."""
|
4
2
|
|
5
3
|
import glob
|
6
4
|
import logging
|
@@ -61,7 +59,7 @@ def _link_and_save_file(
|
|
61
59
|
|
62
60
|
|
63
61
|
def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) -> bool:
|
64
|
-
"""
|
62
|
+
"""Check if a path is a tfevents file created by hostname.
|
65
63
|
|
66
64
|
tensorboard tfevents filename format:
|
67
65
|
https://github.com/tensorflow/tensorboard/blob/f3f26b46981da5bd46a5bb93fcf02d9eb7608bc1/tensorboard/summary/writer/event_file_writer.py#L81
|
@@ -95,7 +93,7 @@ def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) ->
|
|
95
93
|
# TODO: we should also check the PID (also contained in the tfevents
|
96
94
|
# filename). Can we assume that our parent pid is the user process
|
97
95
|
# that wrote these files?
|
98
|
-
return created_time >= int(start_time)
|
96
|
+
return created_time >= int(start_time)
|
99
97
|
|
100
98
|
|
101
99
|
class TBWatcher:
|
@@ -216,7 +214,7 @@ class TBDirWatcher:
|
|
216
214
|
self._thread.start()
|
217
215
|
|
218
216
|
def _is_our_tfevents_file(self, path: str) -> bool:
|
219
|
-
"""
|
217
|
+
"""Check if a path has been modified since launch and contains tfevents."""
|
220
218
|
if not path:
|
221
219
|
raise ValueError("Path must be a nonempty string")
|
222
220
|
if self._force:
|
@@ -229,7 +227,7 @@ class TBDirWatcher:
|
|
229
227
|
def _loader(
|
230
228
|
self, save: bool = True, namespace: Optional[str] = None
|
231
229
|
) -> "EventFileLoader":
|
232
|
-
"""Incredibly hacky class generator to optionally save / prefix tfevent files"""
|
230
|
+
"""Incredibly hacky class generator to optionally save / prefix tfevent files."""
|
233
231
|
_loader_interface = self._tbwatcher._interface
|
234
232
|
_loader_settings = self._tbwatcher._settings
|
235
233
|
try:
|
@@ -285,7 +283,7 @@ class TBDirWatcher:
|
|
285
283
|
raise e
|
286
284
|
|
287
285
|
def _thread_body(self) -> None:
|
288
|
-
"""Check for new events every second"""
|
286
|
+
"""Check for new events every second."""
|
289
287
|
shutdown_time: Optional[float] = None
|
290
288
|
while True:
|
291
289
|
self._process_events()
|
@@ -318,7 +316,7 @@ class TBDirWatcher:
|
|
318
316
|
|
319
317
|
|
320
318
|
class Event:
|
321
|
-
"""An event wrapper to enable priority queueing"""
|
319
|
+
"""An event wrapper to enable priority queueing."""
|
322
320
|
|
323
321
|
def __init__(self, event: "ProtoEvent", namespace: Optional[str]):
|
324
322
|
self.event = event
|
@@ -332,10 +330,11 @@ class Event:
|
|
332
330
|
|
333
331
|
|
334
332
|
class TBEventConsumer:
|
335
|
-
"""
|
336
|
-
|
337
|
-
|
338
|
-
out of order
|
333
|
+
"""Consume tfevents from a priority queue.
|
334
|
+
|
335
|
+
There should always only be one of these per run_manager. We wait for 10 seconds of
|
336
|
+
queued events to reduce the chance of multiple tfevent files triggering out of order
|
337
|
+
steps.
|
339
338
|
"""
|
340
339
|
|
341
340
|
def __init__(
|
@@ -1,8 +1,7 @@
|
|
1
|
+
"""Convert launch arguments into a runnable wandb launch script.
|
2
|
+
|
3
|
+
Arguments can come from a launch spec or call to wandb launch.
|
1
4
|
"""
|
2
|
-
Internal utility for converting arguments from a launch spec or call to wandb launch
|
3
|
-
into a runnable wandb launch script
|
4
|
-
"""
|
5
|
-
import binascii
|
6
5
|
import enum
|
7
6
|
import json
|
8
7
|
import logging
|
@@ -15,11 +14,11 @@ import wandb
|
|
15
14
|
import wandb.docker as docker
|
16
15
|
from wandb.apis.internal import Api
|
17
16
|
from wandb.apis.public import Artifact as PublicArtifact
|
18
|
-
from wandb.errors import CommError
|
17
|
+
from wandb.errors import CommError
|
19
18
|
from wandb.sdk.lib.runid import generate_id
|
20
19
|
|
21
20
|
from . import utils
|
22
|
-
from .utils import LOG_PREFIX
|
21
|
+
from .utils import LOG_PREFIX, LaunchError
|
23
22
|
|
24
23
|
_logger = logging.getLogger(__name__)
|
25
24
|
|
@@ -60,7 +59,6 @@ class LaunchProject:
|
|
60
59
|
overrides: Dict[str, Any],
|
61
60
|
resource: str,
|
62
61
|
resource_args: Dict[str, Any],
|
63
|
-
cuda: Optional[bool],
|
64
62
|
run_id: Optional[str],
|
65
63
|
):
|
66
64
|
if uri is not None and utils.is_bare_wandb_uri(uri):
|
@@ -68,17 +66,24 @@ class LaunchProject:
|
|
68
66
|
_logger.info(f"{LOG_PREFIX}Updating uri with base uri: {uri}")
|
69
67
|
self.uri = uri
|
70
68
|
self.job = job
|
71
|
-
|
69
|
+
if job is not None:
|
70
|
+
wandb.termlog(f"{LOG_PREFIX}Launching job: {job}")
|
72
71
|
self._job_artifact: Optional[PublicArtifact] = None
|
73
72
|
self.api = api
|
74
73
|
self.launch_spec = launch_spec
|
75
74
|
self.target_entity = target_entity
|
76
75
|
self.target_project = target_project.lower()
|
77
76
|
self.name = name # TODO: replace with run_id
|
77
|
+
# the builder key can be passed in through the resource args
|
78
|
+
# but these resource_args are then passed to the appropriate
|
79
|
+
# runner, so we need to pop the builder key out
|
80
|
+
resource_args_build = resource_args.get(resource, {}).pop("builder", {})
|
78
81
|
self.resource = resource
|
79
82
|
self.resource_args = resource_args
|
80
83
|
self.python_version: Optional[str] = launch_spec.get("python_version")
|
81
|
-
self.
|
84
|
+
self.cuda_base_image: Optional[str] = resource_args_build.get("cuda", {}).get(
|
85
|
+
"base_image"
|
86
|
+
)
|
82
87
|
self._base_image: Optional[str] = launch_spec.get("base_image")
|
83
88
|
self.docker_image: Optional[str] = docker_config.get(
|
84
89
|
"docker_image"
|
@@ -95,11 +100,8 @@ class LaunchProject:
|
|
95
100
|
self.override_artifacts: Dict[str, Any] = overrides.get("artifacts", {})
|
96
101
|
self.override_entrypoint: Optional[EntryPoint] = None
|
97
102
|
self.deps_type: Optional[str] = None
|
98
|
-
self.cuda = cuda
|
99
103
|
self._runtime: Optional[str] = None
|
100
104
|
self.run_id = run_id or generate_id()
|
101
|
-
self._image_tag: str = self._initialize_image_job_tag() or self.run_id
|
102
|
-
wandb.termlog(f"{LOG_PREFIX}Launch project using image tag {self._image_tag}")
|
103
105
|
self._entry_points: Dict[
|
104
106
|
str, EntryPoint
|
105
107
|
] = {} # todo: keep multiple entrypoint support?
|
@@ -139,15 +141,13 @@ class LaunchProject:
|
|
139
141
|
)
|
140
142
|
self.source = LaunchSource.LOCAL
|
141
143
|
self.project_dir = self.uri
|
142
|
-
if launch_spec.get("resource_args"):
|
143
|
-
self.resource_args = launch_spec["resource_args"]
|
144
144
|
|
145
145
|
self.aux_dir = tempfile.mkdtemp()
|
146
146
|
self.clear_parameter_run_config_collisions()
|
147
147
|
|
148
148
|
@property
|
149
149
|
def base_image(self) -> str:
|
150
|
-
"""Returns {PROJECT}_base:{PYTHON_VERSION}"""
|
150
|
+
"""Returns {PROJECT}_base:{PYTHON_VERSION}."""
|
151
151
|
# TODO: this should likely be source_project when we have it...
|
152
152
|
|
153
153
|
# don't make up a separate base image name if user provides a docker image
|
@@ -174,25 +174,15 @@ class LaunchProject:
|
|
174
174
|
assert self.job is not None
|
175
175
|
return wandb.util.make_docker_image_name_safe(self.job.split(":")[0])
|
176
176
|
|
177
|
-
def
|
178
|
-
if
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
@property
|
187
|
-
def image_uri(self) -> str:
|
188
|
-
if self.docker_image:
|
189
|
-
return self.docker_image
|
190
|
-
return f"{self.image_name}:{self.image_tag}"
|
191
|
-
|
192
|
-
@property
|
193
|
-
def image_tag(self) -> str:
|
194
|
-
|
195
|
-
return self._image_tag[:IMAGE_TAG_MAX_LENGTH]
|
177
|
+
def build_required(self) -> bool:
|
178
|
+
"""Checks the source to see if a build is required."""
|
179
|
+
# since the image tag for images built from jobs
|
180
|
+
# is based on the job version index, which is immutable
|
181
|
+
# we don't need to build the image for a job if that tag
|
182
|
+
# already exists
|
183
|
+
if self.source != LaunchSource.JOB:
|
184
|
+
return True
|
185
|
+
return False
|
196
186
|
|
197
187
|
@property
|
198
188
|
def docker_image(self) -> Optional[str]:
|
@@ -225,7 +215,7 @@ class LaunchProject:
|
|
225
215
|
return list(self._entry_points.values())[0]
|
226
216
|
|
227
217
|
def add_entry_point(self, command: List[str]) -> "EntryPoint":
|
228
|
-
"""
|
218
|
+
"""Add an entry point to the project."""
|
229
219
|
entry_point = command[-1]
|
230
220
|
new_entrypoint = EntryPoint(name=entry_point, command=command)
|
231
221
|
self._entry_points[entry_point] = new_entrypoint
|
@@ -243,10 +233,37 @@ class LaunchProject:
|
|
243
233
|
try:
|
244
234
|
job = public_api.job(self.job, path=job_dir)
|
245
235
|
except CommError:
|
246
|
-
raise LaunchError(
|
236
|
+
raise LaunchError(
|
237
|
+
f"Job {self.job} not found. Jobs have the format: <entity>/<project>/<name>:<alias>"
|
238
|
+
)
|
247
239
|
job.configure_launch_project(self)
|
248
240
|
self._job_artifact = job._job_artifact
|
249
241
|
|
242
|
+
def get_image_source_string(self) -> str:
|
243
|
+
"""Returns a unique string identifying the source of an image."""
|
244
|
+
if self.source == LaunchSource.LOCAL:
|
245
|
+
# TODO: more correct to get a hash of local uri contents
|
246
|
+
assert isinstance(self.uri, str)
|
247
|
+
return self.uri
|
248
|
+
elif self.source == LaunchSource.JOB:
|
249
|
+
assert self._job_artifact is not None
|
250
|
+
return f"{self._job_artifact.name}:v{self._job_artifact.version}"
|
251
|
+
elif self.source == LaunchSource.GIT:
|
252
|
+
assert isinstance(self.uri, str)
|
253
|
+
ret = self.uri
|
254
|
+
if self.git_version:
|
255
|
+
ret += self.git_version
|
256
|
+
return ret
|
257
|
+
elif self.source == LaunchSource.WANDB:
|
258
|
+
assert isinstance(self.uri, str)
|
259
|
+
return self.uri
|
260
|
+
elif self.source == LaunchSource.DOCKER:
|
261
|
+
assert isinstance(self.docker_image, str)
|
262
|
+
_logger.debug("")
|
263
|
+
return self.docker_image
|
264
|
+
else:
|
265
|
+
raise LaunchError("Unknown source type when determing image source string")
|
266
|
+
|
250
267
|
def _fetch_project_local(self, internal_api: Api) -> None:
|
251
268
|
"""Fetch a project (either wandb run or git repo) into a local directory, returning the path to the local project directory."""
|
252
269
|
# these asserts are all guaranteed to pass, but are required by mypy
|
@@ -263,24 +280,6 @@ class LaunchProject:
|
|
263
280
|
)
|
264
281
|
program_name = run_info.get("codePath") or run_info["program"]
|
265
282
|
|
266
|
-
if run_info.get("cudaVersion"):
|
267
|
-
original_cuda_version = ".".join(run_info["cudaVersion"].split(".")[:2])
|
268
|
-
|
269
|
-
if self.cuda is None:
|
270
|
-
# only set cuda on by default if cuda is None (unspecified), not False (user specifically requested cpu image)
|
271
|
-
wandb.termlog(
|
272
|
-
f"{LOG_PREFIX}Original wandb run {source_run_name} was run with cuda version {original_cuda_version}. Enabling cuda builds by default; to build on a CPU-only image, run again with --cuda=False"
|
273
|
-
)
|
274
|
-
self.cuda_version = original_cuda_version
|
275
|
-
self.cuda = True
|
276
|
-
if (
|
277
|
-
self.cuda
|
278
|
-
and self.cuda_version
|
279
|
-
and self.cuda_version != original_cuda_version
|
280
|
-
):
|
281
|
-
wandb.termlog(
|
282
|
-
f"{LOG_PREFIX}Specified cuda version {self.cuda_version} differs from original cuda version {original_cuda_version}. Running with specified version {self.cuda_version}"
|
283
|
-
)
|
284
283
|
self.python_version = run_info.get("python", "3")
|
285
284
|
downloaded_code_artifact = utils.check_and_download_code_artifacts(
|
286
285
|
source_entity,
|
@@ -289,11 +288,7 @@ class LaunchProject:
|
|
289
288
|
internal_api,
|
290
289
|
self.project_dir,
|
291
290
|
)
|
292
|
-
if downloaded_code_artifact:
|
293
|
-
self._image_tag = binascii.hexlify(
|
294
|
-
downloaded_code_artifact.digest.encode()
|
295
|
-
).decode()
|
296
|
-
else:
|
291
|
+
if not downloaded_code_artifact:
|
297
292
|
if not run_info["git"]:
|
298
293
|
raise LaunchError(
|
299
294
|
"Reproducing a run requires either an associated git repo or a code artifact logged with `run.log_code()`"
|
@@ -308,12 +303,8 @@ class LaunchProject:
|
|
308
303
|
patch = utils.fetch_project_diff(
|
309
304
|
source_entity, source_project, source_run_name, internal_api
|
310
305
|
)
|
311
|
-
tag_string = run_info["git"]["remote"] + run_info["git"]["commit"]
|
312
306
|
if patch:
|
313
307
|
utils.apply_patch(patch, self.project_dir)
|
314
|
-
tag_string += patch
|
315
|
-
|
316
|
-
self._image_tag = binascii.hexlify(tag_string.encode()).decode()
|
317
308
|
|
318
309
|
# For cases where the entry point wasn't checked into git
|
319
310
|
if not os.path.exists(os.path.join(self.project_dir, program_name)):
|
@@ -434,7 +425,6 @@ def create_project_from_spec(launch_spec: Dict[str, Any], api: Api) -> LaunchPro
|
|
434
425
|
Returns:
|
435
426
|
An initialized `LaunchProject` object
|
436
427
|
"""
|
437
|
-
|
438
428
|
name: Optional[str] = None
|
439
429
|
if launch_spec.get("name"):
|
440
430
|
name = launch_spec["name"]
|
@@ -451,7 +441,6 @@ def create_project_from_spec(launch_spec: Dict[str, Any], api: Api) -> LaunchPro
|
|
451
441
|
launch_spec.get("overrides", {}),
|
452
442
|
launch_spec.get("resource", None),
|
453
443
|
launch_spec.get("resource_args", {}),
|
454
|
-
launch_spec.get("cuda", None),
|
455
444
|
launch_spec.get("run_id", None),
|
456
445
|
)
|
457
446
|
|