wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. wandb/__init__.py +2 -3
  2. wandb/apis/__init__.py +1 -3
  3. wandb/apis/importers/__init__.py +4 -0
  4. wandb/apis/importers/base.py +312 -0
  5. wandb/apis/importers/mlflow.py +113 -0
  6. wandb/apis/internal.py +29 -2
  7. wandb/apis/normalize.py +6 -5
  8. wandb/apis/public.py +163 -180
  9. wandb/apis/reports/_templates.py +6 -12
  10. wandb/apis/reports/report.py +1 -1
  11. wandb/apis/reports/runset.py +1 -3
  12. wandb/apis/reports/util.py +12 -10
  13. wandb/beta/workflows.py +57 -34
  14. wandb/catboost/__init__.py +1 -2
  15. wandb/cli/cli.py +215 -133
  16. wandb/data_types.py +63 -56
  17. wandb/docker/__init__.py +78 -16
  18. wandb/docker/auth.py +21 -22
  19. wandb/env.py +0 -1
  20. wandb/errors/__init__.py +8 -116
  21. wandb/errors/term.py +1 -1
  22. wandb/fastai/__init__.py +1 -2
  23. wandb/filesync/dir_watcher.py +8 -5
  24. wandb/filesync/step_prepare.py +76 -75
  25. wandb/filesync/step_upload.py +1 -2
  26. wandb/integration/catboost/__init__.py +1 -3
  27. wandb/integration/catboost/catboost.py +8 -14
  28. wandb/integration/fastai/__init__.py +7 -13
  29. wandb/integration/gym/__init__.py +35 -4
  30. wandb/integration/keras/__init__.py +3 -3
  31. wandb/integration/keras/callbacks/metrics_logger.py +9 -8
  32. wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
  33. wandb/integration/keras/callbacks/tables_builder.py +31 -19
  34. wandb/integration/kfp/kfp_patch.py +20 -17
  35. wandb/integration/kfp/wandb_logging.py +1 -2
  36. wandb/integration/lightgbm/__init__.py +21 -19
  37. wandb/integration/prodigy/prodigy.py +6 -7
  38. wandb/integration/sacred/__init__.py +9 -12
  39. wandb/integration/sagemaker/__init__.py +1 -3
  40. wandb/integration/sagemaker/auth.py +0 -1
  41. wandb/integration/sagemaker/config.py +1 -1
  42. wandb/integration/sagemaker/resources.py +1 -1
  43. wandb/integration/sb3/sb3.py +8 -4
  44. wandb/integration/tensorboard/__init__.py +1 -3
  45. wandb/integration/tensorboard/log.py +8 -8
  46. wandb/integration/tensorboard/monkeypatch.py +11 -9
  47. wandb/integration/tensorflow/__init__.py +1 -3
  48. wandb/integration/xgboost/__init__.py +4 -6
  49. wandb/integration/yolov8/__init__.py +7 -0
  50. wandb/integration/yolov8/yolov8.py +250 -0
  51. wandb/jupyter.py +31 -35
  52. wandb/lightgbm/__init__.py +1 -2
  53. wandb/old/settings.py +2 -2
  54. wandb/plot/bar.py +1 -2
  55. wandb/plot/confusion_matrix.py +1 -3
  56. wandb/plot/histogram.py +1 -2
  57. wandb/plot/line.py +1 -2
  58. wandb/plot/line_series.py +4 -4
  59. wandb/plot/pr_curve.py +17 -20
  60. wandb/plot/roc_curve.py +1 -3
  61. wandb/plot/scatter.py +1 -2
  62. wandb/proto/v3/wandb_server_pb2.py +85 -39
  63. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  64. wandb/proto/v4/wandb_server_pb2.py +51 -39
  65. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  66. wandb/sdk/__init__.py +1 -3
  67. wandb/sdk/backend/backend.py +1 -1
  68. wandb/sdk/data_types/_dtypes.py +38 -30
  69. wandb/sdk/data_types/base_types/json_metadata.py +1 -3
  70. wandb/sdk/data_types/base_types/media.py +17 -17
  71. wandb/sdk/data_types/base_types/wb_value.py +33 -26
  72. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
  73. wandb/sdk/data_types/helper_types/classes.py +1 -1
  74. wandb/sdk/data_types/helper_types/image_mask.py +12 -12
  75. wandb/sdk/data_types/histogram.py +5 -4
  76. wandb/sdk/data_types/html.py +1 -2
  77. wandb/sdk/data_types/image.py +11 -11
  78. wandb/sdk/data_types/molecule.py +3 -6
  79. wandb/sdk/data_types/object_3d.py +1 -2
  80. wandb/sdk/data_types/plotly.py +1 -2
  81. wandb/sdk/data_types/saved_model.py +10 -8
  82. wandb/sdk/data_types/video.py +1 -1
  83. wandb/sdk/integration_utils/data_logging.py +5 -5
  84. wandb/sdk/interface/artifacts.py +288 -266
  85. wandb/sdk/interface/interface.py +2 -3
  86. wandb/sdk/interface/interface_grpc.py +1 -1
  87. wandb/sdk/interface/interface_queue.py +1 -1
  88. wandb/sdk/interface/interface_relay.py +1 -1
  89. wandb/sdk/interface/interface_shared.py +1 -2
  90. wandb/sdk/interface/interface_sock.py +1 -1
  91. wandb/sdk/interface/message_future.py +1 -1
  92. wandb/sdk/interface/message_future_poll.py +1 -1
  93. wandb/sdk/interface/router.py +1 -1
  94. wandb/sdk/interface/router_queue.py +1 -1
  95. wandb/sdk/interface/router_relay.py +1 -1
  96. wandb/sdk/interface/router_sock.py +1 -1
  97. wandb/sdk/interface/summary_record.py +1 -1
  98. wandb/sdk/internal/artifacts.py +1 -1
  99. wandb/sdk/internal/datastore.py +2 -3
  100. wandb/sdk/internal/file_pusher.py +5 -3
  101. wandb/sdk/internal/file_stream.py +22 -19
  102. wandb/sdk/internal/handler.py +5 -4
  103. wandb/sdk/internal/internal.py +1 -1
  104. wandb/sdk/internal/internal_api.py +115 -55
  105. wandb/sdk/internal/job_builder.py +1 -3
  106. wandb/sdk/internal/profiler.py +1 -1
  107. wandb/sdk/internal/progress.py +4 -6
  108. wandb/sdk/internal/sample.py +1 -3
  109. wandb/sdk/internal/sender.py +28 -16
  110. wandb/sdk/internal/settings_static.py +5 -5
  111. wandb/sdk/internal/system/assets/__init__.py +1 -0
  112. wandb/sdk/internal/system/assets/cpu.py +3 -9
  113. wandb/sdk/internal/system/assets/disk.py +2 -4
  114. wandb/sdk/internal/system/assets/gpu.py +6 -18
  115. wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
  116. wandb/sdk/internal/system/assets/interfaces.py +50 -22
  117. wandb/sdk/internal/system/assets/ipu.py +1 -3
  118. wandb/sdk/internal/system/assets/memory.py +7 -13
  119. wandb/sdk/internal/system/assets/network.py +4 -8
  120. wandb/sdk/internal/system/assets/open_metrics.py +283 -0
  121. wandb/sdk/internal/system/assets/tpu.py +1 -4
  122. wandb/sdk/internal/system/assets/trainium.py +26 -14
  123. wandb/sdk/internal/system/system_info.py +2 -3
  124. wandb/sdk/internal/system/system_monitor.py +52 -20
  125. wandb/sdk/internal/tb_watcher.py +12 -13
  126. wandb/sdk/launch/_project_spec.py +54 -65
  127. wandb/sdk/launch/agent/agent.py +374 -90
  128. wandb/sdk/launch/builder/abstract.py +61 -7
  129. wandb/sdk/launch/builder/build.py +81 -110
  130. wandb/sdk/launch/builder/docker_builder.py +181 -0
  131. wandb/sdk/launch/builder/kaniko_builder.py +419 -0
  132. wandb/sdk/launch/builder/noop.py +31 -12
  133. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
  134. wandb/sdk/launch/environment/abstract.py +28 -0
  135. wandb/sdk/launch/environment/aws_environment.py +276 -0
  136. wandb/sdk/launch/environment/gcp_environment.py +271 -0
  137. wandb/sdk/launch/environment/local_environment.py +65 -0
  138. wandb/sdk/launch/github_reference.py +3 -8
  139. wandb/sdk/launch/launch.py +38 -29
  140. wandb/sdk/launch/launch_add.py +6 -8
  141. wandb/sdk/launch/loader.py +230 -0
  142. wandb/sdk/launch/registry/abstract.py +54 -0
  143. wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
  144. wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
  145. wandb/sdk/launch/registry/local_registry.py +62 -0
  146. wandb/sdk/launch/runner/abstract.py +1 -16
  147. wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
  148. wandb/sdk/launch/runner/local_container.py +46 -22
  149. wandb/sdk/launch/runner/local_process.py +1 -4
  150. wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
  151. wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
  152. wandb/sdk/launch/sweeps/__init__.py +3 -2
  153. wandb/sdk/launch/sweeps/scheduler.py +132 -39
  154. wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
  155. wandb/sdk/launch/utils.py +101 -30
  156. wandb/sdk/launch/wandb_reference.py +2 -7
  157. wandb/sdk/lib/_settings_toposort_generate.py +166 -0
  158. wandb/sdk/lib/_settings_toposort_generated.py +201 -0
  159. wandb/sdk/lib/apikey.py +2 -4
  160. wandb/sdk/lib/config_util.py +4 -1
  161. wandb/sdk/lib/console.py +1 -3
  162. wandb/sdk/lib/deprecate.py +3 -3
  163. wandb/sdk/lib/file_stream_utils.py +7 -5
  164. wandb/sdk/lib/filenames.py +1 -1
  165. wandb/sdk/lib/filesystem.py +61 -5
  166. wandb/sdk/lib/git.py +1 -3
  167. wandb/sdk/lib/import_hooks.py +4 -7
  168. wandb/sdk/lib/ipython.py +8 -5
  169. wandb/sdk/lib/lazyloader.py +1 -3
  170. wandb/sdk/lib/mailbox.py +14 -4
  171. wandb/sdk/lib/proto_util.py +10 -5
  172. wandb/sdk/lib/redirect.py +15 -22
  173. wandb/sdk/lib/reporting.py +1 -3
  174. wandb/sdk/lib/retry.py +4 -5
  175. wandb/sdk/lib/runid.py +1 -3
  176. wandb/sdk/lib/server.py +15 -9
  177. wandb/sdk/lib/sock_client.py +1 -1
  178. wandb/sdk/lib/sparkline.py +1 -1
  179. wandb/sdk/lib/wburls.py +1 -1
  180. wandb/sdk/service/port_file.py +1 -2
  181. wandb/sdk/service/service.py +36 -13
  182. wandb/sdk/service/service_base.py +12 -1
  183. wandb/sdk/verify/verify.py +5 -7
  184. wandb/sdk/wandb_artifacts.py +142 -177
  185. wandb/sdk/wandb_config.py +5 -8
  186. wandb/sdk/wandb_helper.py +1 -1
  187. wandb/sdk/wandb_init.py +24 -13
  188. wandb/sdk/wandb_login.py +9 -9
  189. wandb/sdk/wandb_manager.py +39 -4
  190. wandb/sdk/wandb_metric.py +2 -6
  191. wandb/sdk/wandb_require.py +4 -15
  192. wandb/sdk/wandb_require_helpers.py +1 -9
  193. wandb/sdk/wandb_run.py +95 -141
  194. wandb/sdk/wandb_save.py +1 -3
  195. wandb/sdk/wandb_settings.py +149 -54
  196. wandb/sdk/wandb_setup.py +66 -46
  197. wandb/sdk/wandb_summary.py +13 -10
  198. wandb/sdk/wandb_sweep.py +6 -7
  199. wandb/sdk/wandb_watch.py +1 -1
  200. wandb/sklearn/calculate/confusion_matrix.py +1 -1
  201. wandb/sklearn/calculate/learning_curve.py +1 -1
  202. wandb/sklearn/calculate/summary_metrics.py +1 -3
  203. wandb/sklearn/plot/__init__.py +1 -1
  204. wandb/sklearn/plot/classifier.py +27 -18
  205. wandb/sklearn/plot/clusterer.py +4 -5
  206. wandb/sklearn/plot/regressor.py +4 -4
  207. wandb/sklearn/plot/shared.py +2 -2
  208. wandb/sync/__init__.py +1 -3
  209. wandb/sync/sync.py +4 -5
  210. wandb/testing/relay.py +11 -10
  211. wandb/trigger.py +1 -1
  212. wandb/util.py +106 -81
  213. wandb/viz.py +4 -4
  214. wandb/wandb_agent.py +50 -50
  215. wandb/wandb_controller.py +2 -3
  216. wandb/wandb_run.py +1 -2
  217. wandb/wandb_torch.py +1 -1
  218. wandb/xgboost/__init__.py +1 -2
  219. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
  220. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
  221. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
  222. wandb/sdk/launch/builder/docker.py +0 -80
  223. wandb/sdk/launch/builder/kaniko.py +0 -393
  224. wandb/sdk/launch/builder/loader.py +0 -32
  225. wandb/sdk/launch/runner/loader.py +0 -50
  226. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
  227. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
  228. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -92,9 +92,7 @@ class _Stats:
92
92
 
93
93
 
94
94
  class NeuronCoreStats:
95
- """
96
- AWS Trainium stats.
97
- """
95
+ """AWS Trainium stats."""
98
96
 
99
97
  name: str = "trn.{key}"
100
98
  samples: "Deque[_Stats]"
@@ -124,7 +122,7 @@ class NeuronCoreStats:
124
122
  ) as process:
125
123
  while not self.shutdown_event.is_set():
126
124
  if process.stdout is None:
127
- time.sleep(0.1)
125
+ self.shutdown_event.wait(0.1)
128
126
  continue
129
127
 
130
128
  raw_data = process.stdout.readline()
@@ -151,6 +149,15 @@ class NeuronCoreStats:
151
149
  self.samples: "Deque[_Stats]" = deque()
152
150
  self.shutdown_event = threading.Event()
153
151
 
152
+ self.neuron_monitor_thread: Optional[threading.Thread] = None
153
+
154
+ def setup(self) -> None:
155
+ """Start the neuron-monitor thread for collecting raw data."""
156
+ if self.neuron_monitor_thread is not None:
157
+ return
158
+
159
+ logger.debug("Starting neuron-monitor thread")
160
+ self.shutdown_event.clear()
154
161
  self.neuron_monitor_thread = threading.Thread(
155
162
  name="NeuronCoreMntr",
156
163
  target=self.neuron_monitor,
@@ -158,9 +165,20 @@ class NeuronCoreStats:
158
165
  )
159
166
  self.neuron_monitor_thread.start()
160
167
 
168
+ def teardown(self) -> None:
169
+ """Stop the neuron-monitor thread."""
170
+ logger.debug("Stopping neuron-monitor thread")
171
+ try:
172
+ self.shutdown_event.set()
173
+ assert self.neuron_monitor_thread is not None
174
+ self.neuron_monitor_thread.join()
175
+ except Exception as e:
176
+ logger.error("neuron-monitor thread failed to stop: %s" % e)
177
+ finally:
178
+ self.neuron_monitor_thread = None
179
+
161
180
  def _is_matching_entry(self, entry: dict) -> bool:
162
- """
163
- For now, only check if the pid in the entry matches the pid of the process.
181
+ """For now, only check if the pid in the entry matches the pid of the process.
164
182
 
165
183
  todo: add matching by neuron_runtime_tag
166
184
  """
@@ -218,9 +236,7 @@ class NeuronCoreStats:
218
236
 
219
237
  @staticmethod
220
238
  def flatten_stats(sample: _Stats) -> dict:
221
- """
222
- Flatten _Stats object into a flat dict of numbers.
223
- """
239
+ """Flatten _Stats object into a flat dict of numbers."""
224
240
  flattened = {}
225
241
 
226
242
  def helper(key: str, value: Any) -> None:
@@ -302,6 +318,7 @@ class Trainium:
302
318
  # on some systems that do not have the hardware
303
319
  try:
304
320
  # redirect stderr to null to avoid printing errors to the console
321
+ # todo: alternative: check /dev/neuron0 ? sysfs support coming soon in neuron tools
305
322
  output = subprocess.check_output(
306
323
  NEURON_LS_COMMAND,
307
324
  universal_newlines=True,
@@ -319,11 +336,6 @@ class Trainium:
319
336
 
320
337
  def finish(self) -> None:
321
338
  self.metrics_monitor.finish()
322
- # stop the raw data acquisition threads
323
- for metric in self.metrics:
324
- if hasattr(metric, "shutdown_event"):
325
- logger.debug("Stopping neuron-monitor thread")
326
- metric.shutdown_event.set()
327
339
 
328
340
  def probe(self) -> dict:
329
341
  try:
@@ -47,7 +47,7 @@ class SystemInfo:
47
47
 
48
48
  # todo: refactor these _save_* methods
49
49
  def _save_pip(self) -> None:
50
- """Saves the current working set of pip packages to {REQUIREMENTS_FNAME}"""
50
+ """Save the current working set of pip packages to {REQUIREMENTS_FNAME}."""
51
51
  logger.debug(
52
52
  "Saving list of pip packages installed into the current environment"
53
53
  )
@@ -220,8 +220,7 @@ class SystemInfo:
220
220
  if self.settings._jupyter_path.startswith("fileId="):
221
221
  unescaped = unquote(self.settings._jupyter_path)
222
222
  data["colab"] = (
223
- "https://colab.research.google.com/notebook#"
224
- + unescaped # noqa
223
+ "https://colab.research.google.com/notebook#" + unescaped
225
224
  )
226
225
  data["program"] = self.settings._jupyter_name
227
226
  else:
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
6
6
 
7
7
  from .assets.asset_registry import asset_registry
8
8
  from .assets.interfaces import Asset, Interface
9
+ from .assets.open_metrics import OpenMetrics
9
10
  from .system_info import SystemInfo
10
11
 
11
12
  if TYPE_CHECKING:
@@ -47,6 +48,8 @@ class SystemMonitor:
47
48
  self._shutdown_event: mp.synchronize.Event = mp.Event()
48
49
  self._process: Optional[Union[mp.Process, threading.Thread]] = None
49
50
 
51
+ self.settings = settings
52
+
50
53
  # settings._stats_join_assets controls whether we should join stats from different assets
51
54
  # before publishing them to the backend. If set to False, we will publish stats from each
52
55
  # asset separately, using the backend interface. If set to True, we will aggregate stats from
@@ -59,14 +62,16 @@ class SystemMonitor:
59
62
  sampling_interval: float = float(
60
63
  max(
61
64
  0.1,
62
- settings._stats_sample_rate_seconds,
65
+ self.settings._stats_sample_rate_seconds,
63
66
  )
64
67
  ) # seconds
65
68
  # The number of samples to aggregate (e.g. average or compute max/min etc.)
66
69
  # before publishing; defaults to 15; valid range: [1:30]
67
- samples_to_aggregate: int = min(30, max(1, settings._stats_samples_to_average))
70
+ samples_to_aggregate: int = min(
71
+ 30, max(1, self.settings._stats_samples_to_average)
72
+ )
68
73
  self.publishing_interval: float = sampling_interval * samples_to_aggregate
69
- self.join_assets: bool = settings._stats_join_assets
74
+ self.join_assets: bool = self.settings._stats_join_assets
70
75
 
71
76
  self.backend_interface = interface
72
77
  self.asset_interface: Optional[AssetInterface] = (
@@ -74,21 +79,47 @@ class SystemMonitor:
74
79
  )
75
80
 
76
81
  # hardware assets
77
- self.assets: List["Asset"] = []
78
- for asset_class in asset_registry:
79
- self.assets.append(
80
- asset_class(
81
- interface=self.asset_interface or self.backend_interface,
82
- settings=settings,
83
- shutdown_event=self._shutdown_event,
84
- )
85
- )
82
+ self.assets: List["Asset"] = self._get_assets()
83
+
84
+ # OpenMetrics/Prometheus-compatible endpoints
85
+ self.assets.extend(self._get_open_metrics_assets())
86
86
 
87
87
  # static system info, both hardware and software
88
88
  self.system_info: SystemInfo = SystemInfo(
89
- settings=settings, interface=interface
89
+ settings=self.settings, interface=interface
90
90
  )
91
91
 
92
+ def _get_assets(self) -> List["Asset"]:
93
+ return [
94
+ asset_class(
95
+ interface=self.asset_interface or self.backend_interface,
96
+ settings=self.settings,
97
+ shutdown_event=self._shutdown_event,
98
+ )
99
+ for asset_class in asset_registry
100
+ ]
101
+
102
+ def _get_open_metrics_assets(self) -> List["Asset"]:
103
+ open_metrics_endpoints = self.settings._stats_open_metrics_endpoints
104
+ if not open_metrics_endpoints:
105
+ return []
106
+
107
+ assets: List[Asset] = []
108
+ for name, endpoint in open_metrics_endpoints.items():
109
+ if not OpenMetrics.is_available(url=endpoint):
110
+ continue
111
+ logger.debug(f"Monitoring OpenMetrics endpoint: {endpoint}")
112
+ open_metrics = OpenMetrics(
113
+ interface=self.asset_interface or self.backend_interface,
114
+ settings=self.settings,
115
+ shutdown_event=self._shutdown_event,
116
+ name=name,
117
+ url=endpoint,
118
+ )
119
+ assets.append(open_metrics) # type: ignore
120
+
121
+ return assets
122
+
92
123
  def aggregate_and_publish_asset_metrics(self) -> None:
93
124
  if self.asset_interface is None:
94
125
  return None
@@ -147,13 +178,14 @@ class SystemMonitor:
147
178
 
148
179
  def start(self) -> None:
149
180
  self._shutdown_event.clear()
150
- if self._process is None:
151
- logger.info("Starting system monitor")
152
- # self._process = mp.Process(target=self._start, name="SystemMonitor")
153
- self._process = threading.Thread(
154
- target=self._start, daemon=True, name="SystemMonitor"
155
- )
156
- self._process.start()
181
+ if self._process is not None:
182
+ return None
183
+ logger.info("Starting system monitor")
184
+ # self._process = mp.Process(target=self._start, name="SystemMonitor")
185
+ self._process = threading.Thread(
186
+ target=self._start, daemon=True, name="SystemMonitor"
187
+ )
188
+ self._process.start()
157
189
 
158
190
  def finish(self) -> None:
159
191
  if self._process is None:
@@ -1,6 +1,4 @@
1
- """
2
- tensorboard watcher.
3
- """
1
+ """tensorboard watcher."""
4
2
 
5
3
  import glob
6
4
  import logging
@@ -61,7 +59,7 @@ def _link_and_save_file(
61
59
 
62
60
 
63
61
  def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) -> bool:
64
- """Checks if a path is a tfevents file created by hostname.
62
+ """Check if a path is a tfevents file created by hostname.
65
63
 
66
64
  tensorboard tfevents filename format:
67
65
  https://github.com/tensorflow/tensorboard/blob/f3f26b46981da5bd46a5bb93fcf02d9eb7608bc1/tensorboard/summary/writer/event_file_writer.py#L81
@@ -95,7 +93,7 @@ def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) ->
95
93
  # TODO: we should also check the PID (also contained in the tfevents
96
94
  # filename). Can we assume that our parent pid is the user process
97
95
  # that wrote these files?
98
- return created_time >= int(start_time) # noqa: W503
96
+ return created_time >= int(start_time)
99
97
 
100
98
 
101
99
  class TBWatcher:
@@ -216,7 +214,7 @@ class TBDirWatcher:
216
214
  self._thread.start()
217
215
 
218
216
  def _is_our_tfevents_file(self, path: str) -> bool:
219
- """Checks if a path has been modified since launch and contains tfevents"""
217
+ """Check if a path has been modified since launch and contains tfevents."""
220
218
  if not path:
221
219
  raise ValueError("Path must be a nonempty string")
222
220
  if self._force:
@@ -229,7 +227,7 @@ class TBDirWatcher:
229
227
  def _loader(
230
228
  self, save: bool = True, namespace: Optional[str] = None
231
229
  ) -> "EventFileLoader":
232
- """Incredibly hacky class generator to optionally save / prefix tfevent files"""
230
+ """Incredibly hacky class generator to optionally save / prefix tfevent files."""
233
231
  _loader_interface = self._tbwatcher._interface
234
232
  _loader_settings = self._tbwatcher._settings
235
233
  try:
@@ -285,7 +283,7 @@ class TBDirWatcher:
285
283
  raise e
286
284
 
287
285
  def _thread_body(self) -> None:
288
- """Check for new events every second"""
286
+ """Check for new events every second."""
289
287
  shutdown_time: Optional[float] = None
290
288
  while True:
291
289
  self._process_events()
@@ -318,7 +316,7 @@ class TBDirWatcher:
318
316
 
319
317
 
320
318
  class Event:
321
- """An event wrapper to enable priority queueing"""
319
+ """An event wrapper to enable priority queueing."""
322
320
 
323
321
  def __init__(self, event: "ProtoEvent", namespace: Optional[str]):
324
322
  self.event = event
@@ -332,10 +330,11 @@ class Event:
332
330
 
333
331
 
334
332
  class TBEventConsumer:
335
- """Consumes tfevents from a priority queue. There should always
336
- only be one of these per run_manager. We wait for 10 seconds of queued
337
- events to reduce the chance of multiple tfevent files triggering
338
- out of order steps.
333
+ """Consume tfevents from a priority queue.
334
+
335
+ There should always only be one of these per run_manager. We wait for 10 seconds of
336
+ queued events to reduce the chance of multiple tfevent files triggering out of order
337
+ steps.
339
338
  """
340
339
 
341
340
  def __init__(
@@ -1,8 +1,7 @@
1
+ """Convert launch arguments into a runnable wandb launch script.
2
+
3
+ Arguments can come from a launch spec or call to wandb launch.
1
4
  """
2
- Internal utility for converting arguments from a launch spec or call to wandb launch
3
- into a runnable wandb launch script
4
- """
5
- import binascii
6
5
  import enum
7
6
  import json
8
7
  import logging
@@ -15,11 +14,11 @@ import wandb
15
14
  import wandb.docker as docker
16
15
  from wandb.apis.internal import Api
17
16
  from wandb.apis.public import Artifact as PublicArtifact
18
- from wandb.errors import CommError, LaunchError
17
+ from wandb.errors import CommError
19
18
  from wandb.sdk.lib.runid import generate_id
20
19
 
21
20
  from . import utils
22
- from .utils import LOG_PREFIX
21
+ from .utils import LOG_PREFIX, LaunchError
23
22
 
24
23
  _logger = logging.getLogger(__name__)
25
24
 
@@ -60,7 +59,6 @@ class LaunchProject:
60
59
  overrides: Dict[str, Any],
61
60
  resource: str,
62
61
  resource_args: Dict[str, Any],
63
- cuda: Optional[bool],
64
62
  run_id: Optional[str],
65
63
  ):
66
64
  if uri is not None and utils.is_bare_wandb_uri(uri):
@@ -68,17 +66,24 @@ class LaunchProject:
68
66
  _logger.info(f"{LOG_PREFIX}Updating uri with base uri: {uri}")
69
67
  self.uri = uri
70
68
  self.job = job
71
- wandb.termlog(f"{LOG_PREFIX}Launch project got job {job}")
69
+ if job is not None:
70
+ wandb.termlog(f"{LOG_PREFIX}Launching job: {job}")
72
71
  self._job_artifact: Optional[PublicArtifact] = None
73
72
  self.api = api
74
73
  self.launch_spec = launch_spec
75
74
  self.target_entity = target_entity
76
75
  self.target_project = target_project.lower()
77
76
  self.name = name # TODO: replace with run_id
77
+ # the builder key can be passed in through the resource args
78
+ # but these resource_args are then passed to the appropriate
79
+ # runner, so we need to pop the builder key out
80
+ resource_args_build = resource_args.get(resource, {}).pop("builder", {})
78
81
  self.resource = resource
79
82
  self.resource_args = resource_args
80
83
  self.python_version: Optional[str] = launch_spec.get("python_version")
81
- self.cuda_version: Optional[str] = launch_spec.get("cuda_version")
84
+ self.cuda_base_image: Optional[str] = resource_args_build.get("cuda", {}).get(
85
+ "base_image"
86
+ )
82
87
  self._base_image: Optional[str] = launch_spec.get("base_image")
83
88
  self.docker_image: Optional[str] = docker_config.get(
84
89
  "docker_image"
@@ -95,11 +100,8 @@ class LaunchProject:
95
100
  self.override_artifacts: Dict[str, Any] = overrides.get("artifacts", {})
96
101
  self.override_entrypoint: Optional[EntryPoint] = None
97
102
  self.deps_type: Optional[str] = None
98
- self.cuda = cuda
99
103
  self._runtime: Optional[str] = None
100
104
  self.run_id = run_id or generate_id()
101
- self._image_tag: str = self._initialize_image_job_tag() or self.run_id
102
- wandb.termlog(f"{LOG_PREFIX}Launch project using image tag {self._image_tag}")
103
105
  self._entry_points: Dict[
104
106
  str, EntryPoint
105
107
  ] = {} # todo: keep multiple entrypoint support?
@@ -139,15 +141,13 @@ class LaunchProject:
139
141
  )
140
142
  self.source = LaunchSource.LOCAL
141
143
  self.project_dir = self.uri
142
- if launch_spec.get("resource_args"):
143
- self.resource_args = launch_spec["resource_args"]
144
144
 
145
145
  self.aux_dir = tempfile.mkdtemp()
146
146
  self.clear_parameter_run_config_collisions()
147
147
 
148
148
  @property
149
149
  def base_image(self) -> str:
150
- """Returns {PROJECT}_base:{PYTHON_VERSION}"""
150
+ """Returns {PROJECT}_base:{PYTHON_VERSION}."""
151
151
  # TODO: this should likely be source_project when we have it...
152
152
 
153
153
  # don't make up a separate base image name if user provides a docker image
@@ -174,25 +174,15 @@ class LaunchProject:
174
174
  assert self.job is not None
175
175
  return wandb.util.make_docker_image_name_safe(self.job.split(":")[0])
176
176
 
177
- def _initialize_image_job_tag(self) -> Optional[str]:
178
- if self.job is not None:
179
- job_name, alias = self.job.split(":")
180
- # Alias is used to differentiate images between jobs of the same sequence
181
- _image_tag = f"{alias}-{job_name}"
182
- _logger.debug(f"{LOG_PREFIX}Setting image tag {_image_tag}")
183
- return wandb.util.make_docker_image_name_safe(_image_tag)
184
- return None
185
-
186
- @property
187
- def image_uri(self) -> str:
188
- if self.docker_image:
189
- return self.docker_image
190
- return f"{self.image_name}:{self.image_tag}"
191
-
192
- @property
193
- def image_tag(self) -> str:
194
-
195
- return self._image_tag[:IMAGE_TAG_MAX_LENGTH]
177
+ def build_required(self) -> bool:
178
+ """Checks the source to see if a build is required."""
179
+ # since the image tag for images built from jobs
180
+ # is based on the job version index, which is immutable
181
+ # we don't need to build the image for a job if that tag
182
+ # already exists
183
+ if self.source != LaunchSource.JOB:
184
+ return True
185
+ return False
196
186
 
197
187
  @property
198
188
  def docker_image(self) -> Optional[str]:
@@ -225,7 +215,7 @@ class LaunchProject:
225
215
  return list(self._entry_points.values())[0]
226
216
 
227
217
  def add_entry_point(self, command: List[str]) -> "EntryPoint":
228
- """Adds an entry point to the project."""
218
+ """Add an entry point to the project."""
229
219
  entry_point = command[-1]
230
220
  new_entrypoint = EntryPoint(name=entry_point, command=command)
231
221
  self._entry_points[entry_point] = new_entrypoint
@@ -243,10 +233,37 @@ class LaunchProject:
243
233
  try:
244
234
  job = public_api.job(self.job, path=job_dir)
245
235
  except CommError:
246
- raise LaunchError(f"Job {self.job} not found")
236
+ raise LaunchError(
237
+ f"Job {self.job} not found. Jobs have the format: <entity>/<project>/<name>:<alias>"
238
+ )
247
239
  job.configure_launch_project(self)
248
240
  self._job_artifact = job._job_artifact
249
241
 
242
+ def get_image_source_string(self) -> str:
243
+ """Returns a unique string identifying the source of an image."""
244
+ if self.source == LaunchSource.LOCAL:
245
+ # TODO: more correct to get a hash of local uri contents
246
+ assert isinstance(self.uri, str)
247
+ return self.uri
248
+ elif self.source == LaunchSource.JOB:
249
+ assert self._job_artifact is not None
250
+ return f"{self._job_artifact.name}:v{self._job_artifact.version}"
251
+ elif self.source == LaunchSource.GIT:
252
+ assert isinstance(self.uri, str)
253
+ ret = self.uri
254
+ if self.git_version:
255
+ ret += self.git_version
256
+ return ret
257
+ elif self.source == LaunchSource.WANDB:
258
+ assert isinstance(self.uri, str)
259
+ return self.uri
260
+ elif self.source == LaunchSource.DOCKER:
261
+ assert isinstance(self.docker_image, str)
262
+ _logger.debug("")
263
+ return self.docker_image
264
+ else:
265
+ raise LaunchError("Unknown source type when determining image source string")
266
+
250
267
  def _fetch_project_local(self, internal_api: Api) -> None:
251
268
  """Fetch a project (either wandb run or git repo) into a local directory, returning the path to the local project directory."""
252
269
  # these asserts are all guaranteed to pass, but are required by mypy
@@ -263,24 +280,6 @@ class LaunchProject:
263
280
  )
264
281
  program_name = run_info.get("codePath") or run_info["program"]
265
282
 
266
- if run_info.get("cudaVersion"):
267
- original_cuda_version = ".".join(run_info["cudaVersion"].split(".")[:2])
268
-
269
- if self.cuda is None:
270
- # only set cuda on by default if cuda is None (unspecified), not False (user specifically requested cpu image)
271
- wandb.termlog(
272
- f"{LOG_PREFIX}Original wandb run {source_run_name} was run with cuda version {original_cuda_version}. Enabling cuda builds by default; to build on a CPU-only image, run again with --cuda=False"
273
- )
274
- self.cuda_version = original_cuda_version
275
- self.cuda = True
276
- if (
277
- self.cuda
278
- and self.cuda_version
279
- and self.cuda_version != original_cuda_version
280
- ):
281
- wandb.termlog(
282
- f"{LOG_PREFIX}Specified cuda version {self.cuda_version} differs from original cuda version {original_cuda_version}. Running with specified version {self.cuda_version}"
283
- )
284
283
  self.python_version = run_info.get("python", "3")
285
284
  downloaded_code_artifact = utils.check_and_download_code_artifacts(
286
285
  source_entity,
@@ -289,11 +288,7 @@ class LaunchProject:
289
288
  internal_api,
290
289
  self.project_dir,
291
290
  )
292
- if downloaded_code_artifact:
293
- self._image_tag = binascii.hexlify(
294
- downloaded_code_artifact.digest.encode()
295
- ).decode()
296
- else:
291
+ if not downloaded_code_artifact:
297
292
  if not run_info["git"]:
298
293
  raise LaunchError(
299
294
  "Reproducing a run requires either an associated git repo or a code artifact logged with `run.log_code()`"
@@ -308,12 +303,8 @@ class LaunchProject:
308
303
  patch = utils.fetch_project_diff(
309
304
  source_entity, source_project, source_run_name, internal_api
310
305
  )
311
- tag_string = run_info["git"]["remote"] + run_info["git"]["commit"]
312
306
  if patch:
313
307
  utils.apply_patch(patch, self.project_dir)
314
- tag_string += patch
315
-
316
- self._image_tag = binascii.hexlify(tag_string.encode()).decode()
317
308
 
318
309
  # For cases where the entry point wasn't checked into git
319
310
  if not os.path.exists(os.path.join(self.project_dir, program_name)):
@@ -434,7 +425,6 @@ def create_project_from_spec(launch_spec: Dict[str, Any], api: Api) -> LaunchPro
434
425
  Returns:
435
426
  An initialized `LaunchProject` object
436
427
  """
437
-
438
428
  name: Optional[str] = None
439
429
  if launch_spec.get("name"):
440
430
  name = launch_spec["name"]
@@ -451,7 +441,6 @@ def create_project_from_spec(launch_spec: Dict[str, Any], api: Api) -> LaunchPro
451
441
  launch_spec.get("overrides", {}),
452
442
  launch_spec.get("resource", None),
453
443
  launch_spec.get("resource_args", {}),
454
- launch_spec.get("cuda", None),
455
444
  launch_spec.get("run_id", None),
456
445
  )
457
446