wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. wandb/__init__.py +2 -3
  2. wandb/apis/__init__.py +1 -3
  3. wandb/apis/importers/__init__.py +4 -0
  4. wandb/apis/importers/base.py +312 -0
  5. wandb/apis/importers/mlflow.py +113 -0
  6. wandb/apis/internal.py +29 -2
  7. wandb/apis/normalize.py +6 -5
  8. wandb/apis/public.py +163 -180
  9. wandb/apis/reports/_templates.py +6 -12
  10. wandb/apis/reports/report.py +1 -1
  11. wandb/apis/reports/runset.py +1 -3
  12. wandb/apis/reports/util.py +12 -10
  13. wandb/beta/workflows.py +57 -34
  14. wandb/catboost/__init__.py +1 -2
  15. wandb/cli/cli.py +215 -133
  16. wandb/data_types.py +63 -56
  17. wandb/docker/__init__.py +78 -16
  18. wandb/docker/auth.py +21 -22
  19. wandb/env.py +0 -1
  20. wandb/errors/__init__.py +8 -116
  21. wandb/errors/term.py +1 -1
  22. wandb/fastai/__init__.py +1 -2
  23. wandb/filesync/dir_watcher.py +8 -5
  24. wandb/filesync/step_prepare.py +76 -75
  25. wandb/filesync/step_upload.py +1 -2
  26. wandb/integration/catboost/__init__.py +1 -3
  27. wandb/integration/catboost/catboost.py +8 -14
  28. wandb/integration/fastai/__init__.py +7 -13
  29. wandb/integration/gym/__init__.py +35 -4
  30. wandb/integration/keras/__init__.py +3 -3
  31. wandb/integration/keras/callbacks/metrics_logger.py +9 -8
  32. wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
  33. wandb/integration/keras/callbacks/tables_builder.py +31 -19
  34. wandb/integration/kfp/kfp_patch.py +20 -17
  35. wandb/integration/kfp/wandb_logging.py +1 -2
  36. wandb/integration/lightgbm/__init__.py +21 -19
  37. wandb/integration/prodigy/prodigy.py +6 -7
  38. wandb/integration/sacred/__init__.py +9 -12
  39. wandb/integration/sagemaker/__init__.py +1 -3
  40. wandb/integration/sagemaker/auth.py +0 -1
  41. wandb/integration/sagemaker/config.py +1 -1
  42. wandb/integration/sagemaker/resources.py +1 -1
  43. wandb/integration/sb3/sb3.py +8 -4
  44. wandb/integration/tensorboard/__init__.py +1 -3
  45. wandb/integration/tensorboard/log.py +8 -8
  46. wandb/integration/tensorboard/monkeypatch.py +11 -9
  47. wandb/integration/tensorflow/__init__.py +1 -3
  48. wandb/integration/xgboost/__init__.py +4 -6
  49. wandb/integration/yolov8/__init__.py +7 -0
  50. wandb/integration/yolov8/yolov8.py +250 -0
  51. wandb/jupyter.py +31 -35
  52. wandb/lightgbm/__init__.py +1 -2
  53. wandb/old/settings.py +2 -2
  54. wandb/plot/bar.py +1 -2
  55. wandb/plot/confusion_matrix.py +1 -3
  56. wandb/plot/histogram.py +1 -2
  57. wandb/plot/line.py +1 -2
  58. wandb/plot/line_series.py +4 -4
  59. wandb/plot/pr_curve.py +17 -20
  60. wandb/plot/roc_curve.py +1 -3
  61. wandb/plot/scatter.py +1 -2
  62. wandb/proto/v3/wandb_server_pb2.py +85 -39
  63. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  64. wandb/proto/v4/wandb_server_pb2.py +51 -39
  65. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  66. wandb/sdk/__init__.py +1 -3
  67. wandb/sdk/backend/backend.py +1 -1
  68. wandb/sdk/data_types/_dtypes.py +38 -30
  69. wandb/sdk/data_types/base_types/json_metadata.py +1 -3
  70. wandb/sdk/data_types/base_types/media.py +17 -17
  71. wandb/sdk/data_types/base_types/wb_value.py +33 -26
  72. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
  73. wandb/sdk/data_types/helper_types/classes.py +1 -1
  74. wandb/sdk/data_types/helper_types/image_mask.py +12 -12
  75. wandb/sdk/data_types/histogram.py +5 -4
  76. wandb/sdk/data_types/html.py +1 -2
  77. wandb/sdk/data_types/image.py +11 -11
  78. wandb/sdk/data_types/molecule.py +3 -6
  79. wandb/sdk/data_types/object_3d.py +1 -2
  80. wandb/sdk/data_types/plotly.py +1 -2
  81. wandb/sdk/data_types/saved_model.py +10 -8
  82. wandb/sdk/data_types/video.py +1 -1
  83. wandb/sdk/integration_utils/data_logging.py +5 -5
  84. wandb/sdk/interface/artifacts.py +288 -266
  85. wandb/sdk/interface/interface.py +2 -3
  86. wandb/sdk/interface/interface_grpc.py +1 -1
  87. wandb/sdk/interface/interface_queue.py +1 -1
  88. wandb/sdk/interface/interface_relay.py +1 -1
  89. wandb/sdk/interface/interface_shared.py +1 -2
  90. wandb/sdk/interface/interface_sock.py +1 -1
  91. wandb/sdk/interface/message_future.py +1 -1
  92. wandb/sdk/interface/message_future_poll.py +1 -1
  93. wandb/sdk/interface/router.py +1 -1
  94. wandb/sdk/interface/router_queue.py +1 -1
  95. wandb/sdk/interface/router_relay.py +1 -1
  96. wandb/sdk/interface/router_sock.py +1 -1
  97. wandb/sdk/interface/summary_record.py +1 -1
  98. wandb/sdk/internal/artifacts.py +1 -1
  99. wandb/sdk/internal/datastore.py +2 -3
  100. wandb/sdk/internal/file_pusher.py +5 -3
  101. wandb/sdk/internal/file_stream.py +22 -19
  102. wandb/sdk/internal/handler.py +5 -4
  103. wandb/sdk/internal/internal.py +1 -1
  104. wandb/sdk/internal/internal_api.py +115 -55
  105. wandb/sdk/internal/job_builder.py +1 -3
  106. wandb/sdk/internal/profiler.py +1 -1
  107. wandb/sdk/internal/progress.py +4 -6
  108. wandb/sdk/internal/sample.py +1 -3
  109. wandb/sdk/internal/sender.py +28 -16
  110. wandb/sdk/internal/settings_static.py +5 -5
  111. wandb/sdk/internal/system/assets/__init__.py +1 -0
  112. wandb/sdk/internal/system/assets/cpu.py +3 -9
  113. wandb/sdk/internal/system/assets/disk.py +2 -4
  114. wandb/sdk/internal/system/assets/gpu.py +6 -18
  115. wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
  116. wandb/sdk/internal/system/assets/interfaces.py +50 -22
  117. wandb/sdk/internal/system/assets/ipu.py +1 -3
  118. wandb/sdk/internal/system/assets/memory.py +7 -13
  119. wandb/sdk/internal/system/assets/network.py +4 -8
  120. wandb/sdk/internal/system/assets/open_metrics.py +283 -0
  121. wandb/sdk/internal/system/assets/tpu.py +1 -4
  122. wandb/sdk/internal/system/assets/trainium.py +26 -14
  123. wandb/sdk/internal/system/system_info.py +2 -3
  124. wandb/sdk/internal/system/system_monitor.py +52 -20
  125. wandb/sdk/internal/tb_watcher.py +12 -13
  126. wandb/sdk/launch/_project_spec.py +54 -65
  127. wandb/sdk/launch/agent/agent.py +374 -90
  128. wandb/sdk/launch/builder/abstract.py +61 -7
  129. wandb/sdk/launch/builder/build.py +81 -110
  130. wandb/sdk/launch/builder/docker_builder.py +181 -0
  131. wandb/sdk/launch/builder/kaniko_builder.py +419 -0
  132. wandb/sdk/launch/builder/noop.py +31 -12
  133. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
  134. wandb/sdk/launch/environment/abstract.py +28 -0
  135. wandb/sdk/launch/environment/aws_environment.py +276 -0
  136. wandb/sdk/launch/environment/gcp_environment.py +271 -0
  137. wandb/sdk/launch/environment/local_environment.py +65 -0
  138. wandb/sdk/launch/github_reference.py +3 -8
  139. wandb/sdk/launch/launch.py +38 -29
  140. wandb/sdk/launch/launch_add.py +6 -8
  141. wandb/sdk/launch/loader.py +230 -0
  142. wandb/sdk/launch/registry/abstract.py +54 -0
  143. wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
  144. wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
  145. wandb/sdk/launch/registry/local_registry.py +62 -0
  146. wandb/sdk/launch/runner/abstract.py +1 -16
  147. wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
  148. wandb/sdk/launch/runner/local_container.py +46 -22
  149. wandb/sdk/launch/runner/local_process.py +1 -4
  150. wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
  151. wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
  152. wandb/sdk/launch/sweeps/__init__.py +3 -2
  153. wandb/sdk/launch/sweeps/scheduler.py +132 -39
  154. wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
  155. wandb/sdk/launch/utils.py +101 -30
  156. wandb/sdk/launch/wandb_reference.py +2 -7
  157. wandb/sdk/lib/_settings_toposort_generate.py +166 -0
  158. wandb/sdk/lib/_settings_toposort_generated.py +201 -0
  159. wandb/sdk/lib/apikey.py +2 -4
  160. wandb/sdk/lib/config_util.py +4 -1
  161. wandb/sdk/lib/console.py +1 -3
  162. wandb/sdk/lib/deprecate.py +3 -3
  163. wandb/sdk/lib/file_stream_utils.py +7 -5
  164. wandb/sdk/lib/filenames.py +1 -1
  165. wandb/sdk/lib/filesystem.py +61 -5
  166. wandb/sdk/lib/git.py +1 -3
  167. wandb/sdk/lib/import_hooks.py +4 -7
  168. wandb/sdk/lib/ipython.py +8 -5
  169. wandb/sdk/lib/lazyloader.py +1 -3
  170. wandb/sdk/lib/mailbox.py +14 -4
  171. wandb/sdk/lib/proto_util.py +10 -5
  172. wandb/sdk/lib/redirect.py +15 -22
  173. wandb/sdk/lib/reporting.py +1 -3
  174. wandb/sdk/lib/retry.py +4 -5
  175. wandb/sdk/lib/runid.py +1 -3
  176. wandb/sdk/lib/server.py +15 -9
  177. wandb/sdk/lib/sock_client.py +1 -1
  178. wandb/sdk/lib/sparkline.py +1 -1
  179. wandb/sdk/lib/wburls.py +1 -1
  180. wandb/sdk/service/port_file.py +1 -2
  181. wandb/sdk/service/service.py +36 -13
  182. wandb/sdk/service/service_base.py +12 -1
  183. wandb/sdk/verify/verify.py +5 -7
  184. wandb/sdk/wandb_artifacts.py +142 -177
  185. wandb/sdk/wandb_config.py +5 -8
  186. wandb/sdk/wandb_helper.py +1 -1
  187. wandb/sdk/wandb_init.py +24 -13
  188. wandb/sdk/wandb_login.py +9 -9
  189. wandb/sdk/wandb_manager.py +39 -4
  190. wandb/sdk/wandb_metric.py +2 -6
  191. wandb/sdk/wandb_require.py +4 -15
  192. wandb/sdk/wandb_require_helpers.py +1 -9
  193. wandb/sdk/wandb_run.py +95 -141
  194. wandb/sdk/wandb_save.py +1 -3
  195. wandb/sdk/wandb_settings.py +149 -54
  196. wandb/sdk/wandb_setup.py +66 -46
  197. wandb/sdk/wandb_summary.py +13 -10
  198. wandb/sdk/wandb_sweep.py +6 -7
  199. wandb/sdk/wandb_watch.py +1 -1
  200. wandb/sklearn/calculate/confusion_matrix.py +1 -1
  201. wandb/sklearn/calculate/learning_curve.py +1 -1
  202. wandb/sklearn/calculate/summary_metrics.py +1 -3
  203. wandb/sklearn/plot/__init__.py +1 -1
  204. wandb/sklearn/plot/classifier.py +27 -18
  205. wandb/sklearn/plot/clusterer.py +4 -5
  206. wandb/sklearn/plot/regressor.py +4 -4
  207. wandb/sklearn/plot/shared.py +2 -2
  208. wandb/sync/__init__.py +1 -3
  209. wandb/sync/sync.py +4 -5
  210. wandb/testing/relay.py +11 -10
  211. wandb/trigger.py +1 -1
  212. wandb/util.py +106 -81
  213. wandb/viz.py +4 -4
  214. wandb/wandb_agent.py +50 -50
  215. wandb/wandb_controller.py +2 -3
  216. wandb/wandb_run.py +1 -2
  217. wandb/wandb_torch.py +1 -1
  218. wandb/xgboost/__init__.py +1 -2
  219. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
  220. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
  221. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
  222. wandb/sdk/launch/builder/docker.py +0 -80
  223. wandb/sdk/launch/builder/kaniko.py +0 -393
  224. wandb/sdk/launch/builder/loader.py +0 -32
  225. wandb/sdk/launch/runner/loader.py +0 -50
  226. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
  227. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
  228. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -18,9 +18,7 @@ if TYPE_CHECKING:
18
18
 
19
19
 
20
20
  class DiskUsage:
21
- """
22
- Total system disk usage in percent.
23
- """
21
+ """Total system disk usage in percent."""
24
22
 
25
23
  # name = "disk_usage"
26
24
  name = "disk"
@@ -62,7 +60,7 @@ class Disk:
62
60
 
63
61
  @classmethod
64
62
  def is_available(cls) -> bool:
65
- """Return a new instance of the CPU metrics"""
63
+ """Return a new instance of the CPU metrics."""
66
64
  return psutil is not None
67
65
 
68
66
  def probe(self) -> dict:
@@ -55,9 +55,7 @@ def gpu_in_use_by_this_process(gpu_handle: "GPUHandle", pid: int) -> bool:
55
55
 
56
56
 
57
57
  class GPUMemoryUtilization:
58
- """
59
- GPU memory utilization in percent for each GPU.
60
- """
58
+ """GPU memory utilization in percent for each GPU."""
61
59
 
62
60
  # name = "memory_utilization"
63
61
  name = "gpu.{}.memory"
@@ -99,9 +97,7 @@ class GPUMemoryUtilization:
99
97
 
100
98
 
101
99
  class GPUMemoryAllocated:
102
- """
103
- GPU memory allocated in percent for each GPU.
104
- """
100
+ """GPU memory allocated in percent for each GPU."""
105
101
 
106
102
  # name = "memory_allocated"
107
103
  name = "gpu.{}.memoryAllocated"
@@ -142,9 +138,7 @@ class GPUMemoryAllocated:
142
138
 
143
139
 
144
140
  class GPUUtilization:
145
- """
146
- GPU utilization in percent for each GPU.
147
- """
141
+ """GPU utilization in percent for each GPU."""
148
142
 
149
143
  # name = "gpu_utilization"
150
144
  name = "gpu.{}.gpu"
@@ -186,9 +180,7 @@ class GPUUtilization:
186
180
 
187
181
 
188
182
  class GPUTemperature:
189
- """
190
- GPU temperature in Celsius for each GPU.
191
- """
183
+ """GPU temperature in Celsius for each GPU."""
192
184
 
193
185
  # name = "gpu_temperature"
194
186
  name = "gpu.{}.temp"
@@ -233,9 +225,7 @@ class GPUTemperature:
233
225
 
234
226
 
235
227
  class GPUPowerUsageWatts:
236
- """
237
- GPU power usage in Watts for each GPU.
238
- """
228
+ """GPU power usage in Watts for each GPU."""
239
229
 
240
230
  name = "gpu.{}.powerWatts"
241
231
  # samples: Deque[Tuple[datetime.datetime, float]]
@@ -273,9 +263,7 @@ class GPUPowerUsageWatts:
273
263
 
274
264
 
275
265
  class GPUPowerUsagePercent:
276
- """
277
- GPU power usage in percent for each GPU.
278
- """
266
+ """GPU power usage in percent for each GPU."""
279
267
 
280
268
  name = "gpu.{}.powerPercent"
281
269
  # samples: Deque[Tuple[datetime.datetime, float]]
@@ -34,13 +34,11 @@ class _Stats(TypedDict):
34
34
  temp: float
35
35
  powerWatts: float # noqa: N815
36
36
  powerPercent: float # noqa: N815
37
- # cpuWaitMs: float # noqa: N815
37
+ # cpuWaitMs: float
38
38
 
39
39
 
40
40
  class GPUAppleStats:
41
- """
42
- Apple GPU stats available on Arm Macs.
43
- """
41
+ """Apple GPU stats available on Arm Macs."""
44
42
 
45
43
  name = "gpu.0.{}"
46
44
  samples: "Deque[_Stats]"
@@ -26,29 +26,43 @@ logger = logging.getLogger(__name__)
26
26
 
27
27
 
28
28
  class Metric(Protocol):
29
- """
30
- Base protocol for individual metrics
31
- """
29
+ """Base protocol for individual metrics."""
32
30
 
33
31
  name: str
34
32
  # samples: Sequence[Tuple[TimeStamp, Sample]]
35
33
  samples: "Deque[Any]"
36
34
 
37
35
  def sample(self) -> None:
36
+ """Sample the metric."""
38
37
  ... # pragma: no cover
39
38
 
40
39
  def clear(self) -> None:
40
+ """Clear the samples."""
41
41
  ... # pragma: no cover
42
42
 
43
43
  def aggregate(self) -> dict:
44
+ """Aggregate the samples."""
45
+ ... # pragma: no cover
46
+
47
+
48
+ @runtime_checkable
49
+ class SetupTeardown(Protocol):
50
+ """Protocol for classes that require setup and teardown."""
51
+
52
+ def setup(self) -> None:
53
+ """Extra setup required for the metric beyond __init__."""
54
+ ... # pragma: no cover
55
+
56
+ def teardown(self) -> None:
57
+ """Extra teardown required for the metric."""
44
58
  ... # pragma: no cover
45
59
 
46
60
 
47
61
  @runtime_checkable
48
62
  class Asset(Protocol):
49
- """
50
- Base protocol to encapsulate everything relating to an "Asset"
51
- e.g. CPU, GPU, TPU, Network, I/O etc.
63
+ """Base protocol encapsulate everything relating to an "Asset".
64
+
65
+ An asset can be CPU, GPU, TPU, Network, I/O etc.
52
66
  """
53
67
 
54
68
  name: str
@@ -60,19 +74,19 @@ class Asset(Protocol):
60
74
 
61
75
  @classmethod
62
76
  def is_available(cls) -> bool:
63
- """Check if the resource is available"""
77
+ """Check if the resource is available."""
64
78
  ... # pragma: no cover
65
79
 
66
80
  def start(self) -> None:
67
- """Start monitoring the resource"""
81
+ """Start monitoring the resource."""
68
82
  ... # pragma: no cover
69
83
 
70
84
  def finish(self) -> None:
71
- """finish monitoring the resource"""
85
+ """Finish monitoring the resource."""
72
86
  ... # pragma: no cover
73
87
 
74
88
  def probe(self) -> dict:
75
- """Get static information about the resource"""
89
+ """Get static information about the resource."""
76
90
  ... # pragma: no cover
77
91
 
78
92
 
@@ -88,9 +102,7 @@ class Interface(Protocol):
88
102
 
89
103
 
90
104
  class MetricsMonitor:
91
- """
92
- Takes care of collecting, sampling, serializing, and publishing a set of metrics.
93
- """
105
+ """Takes care of collecting, sampling, serializing, and publishing a set of metrics."""
94
106
 
95
107
  def __init__(
96
108
  self,
@@ -119,7 +131,7 @@ class MetricsMonitor:
119
131
  )
120
132
 
121
133
  def monitor(self) -> None:
122
- """Poll the Asset metrics"""
134
+ """Poll the Asset metrics."""
123
135
  while not self._shutdown_event.is_set():
124
136
  for _ in range(self.samples_to_aggregate):
125
137
  for metric in self.metrics:
@@ -133,7 +145,7 @@ class MetricsMonitor:
133
145
  self.publish()
134
146
 
135
147
  def aggregate(self) -> dict:
136
- """Return a dict of metrics"""
148
+ """Return a dict of metrics."""
137
149
  aggregated_metrics = {}
138
150
  for metric in self.metrics:
139
151
  try:
@@ -147,7 +159,7 @@ class MetricsMonitor:
147
159
  return aggregated_metrics
148
160
 
149
161
  def publish(self) -> None:
150
- """Publish the Asset metrics"""
162
+ """Publish the Asset metrics."""
151
163
  try:
152
164
  aggregated_metrics = self.aggregate()
153
165
  if aggregated_metrics:
@@ -158,21 +170,37 @@ class MetricsMonitor:
158
170
  logger.error(f"Failed to publish metrics: {e}")
159
171
 
160
172
  def start(self) -> None:
161
- if self._process is None and not self._shutdown_event.is_set():
173
+ if (self._process is not None) or self._shutdown_event.is_set():
174
+ return None
175
+
176
+ thread_name = f"{self.asset_name[:15]}" # thread names are limited to 15 chars
177
+ try:
178
+ for metric in self.metrics:
179
+ if isinstance(metric, SetupTeardown):
180
+ metric.setup()
162
181
  self._process = threading.Thread(
163
182
  target=self.monitor,
164
183
  daemon=True,
165
- name=f"{self.asset_name}",
184
+ name=thread_name,
166
185
  )
167
186
  self._process.start()
168
- logger.info(f"Started {self._process.name}")
187
+ logger.info(f"Started {thread_name} monitoring")
188
+ except Exception as e:
189
+ logger.warning(f"Failed to start {thread_name} monitoring: {e}")
190
+ self._process = None
169
191
 
170
192
  def finish(self) -> None:
171
193
  if self._process is None:
172
194
  return None
195
+
196
+ thread_name = f"{self.asset_name[:15]}"
173
197
  try:
174
198
  self._process.join()
175
- logger.info(f"Joined {self._process.name}")
199
+ logger.info(f"Joined {thread_name} monitor")
200
+ for metric in self.metrics:
201
+ if isinstance(metric, SetupTeardown):
202
+ metric.teardown()
176
203
  except Exception as e:
177
- logger.warning(f"Failed to join {self._process.name}: {e}")
178
- self._process = None
204
+ logger.warning(f"Failed to finish {thread_name} monitoring: {e}")
205
+ finally:
206
+ self._process = None
@@ -20,9 +20,7 @@ if TYPE_CHECKING:
20
20
 
21
21
 
22
22
  class IPUStats:
23
- """
24
- Stats for Graphcore IPU devices
25
- """
23
+ """Stats for Graphcore IPU devices."""
26
24
 
27
25
  name = "ipu.{}.{}"
28
26
  samples: "Deque[dict]"
@@ -18,8 +18,8 @@ if TYPE_CHECKING:
18
18
 
19
19
 
20
20
  class ProcessMemoryRSS:
21
- """
22
- Memory resident set size (RSS) in MB.
21
+ """Memory resident set size (RSS) in MB.
22
+
23
23
  RSS is the portion of memory occupied by a process that is held in main memory (RAM).
24
24
  """
25
25
 
@@ -49,9 +49,7 @@ class ProcessMemoryRSS:
49
49
 
50
50
 
51
51
  class ProcessMemoryPercent:
52
- """
53
- Process memory usage in percent.
54
- """
52
+ """Process memory usage in percent."""
55
53
 
56
54
  # name = "process_memory_percent"
57
55
  name = "proc.memory.percent"
@@ -79,9 +77,7 @@ class ProcessMemoryPercent:
79
77
 
80
78
 
81
79
  class MemoryPercent:
82
- """
83
- Total system memory usage in percent.
84
- """
80
+ """Total system memory usage in percent."""
85
81
 
86
82
  # name = "memory_percent"
87
83
  name = "memory"
@@ -104,9 +100,7 @@ class MemoryPercent:
104
100
 
105
101
 
106
102
  class MemoryAvailable:
107
- """
108
- Total system memory available in MB.
109
- """
103
+ """Total system memory available in MB."""
110
104
 
111
105
  # name = "memory_available"
112
106
  name = "proc.memory.availableMB"
@@ -159,11 +153,11 @@ class Memory:
159
153
 
160
154
  @classmethod
161
155
  def is_available(cls) -> bool:
162
- """Return a new instance of the CPU metrics"""
156
+ """Return a new instance of the CPU metrics."""
163
157
  return psutil is not None
164
158
 
165
159
  def probe(self) -> dict:
166
- """Return a dict of the hardware information"""
160
+ """Return a dict of the hardware information."""
167
161
  # total available memory in gigabytes
168
162
  return {
169
163
  "memory": {
@@ -18,9 +18,7 @@ if TYPE_CHECKING:
18
18
 
19
19
 
20
20
  class NetworkSent:
21
- """
22
- Network bytes sent.
23
- """
21
+ """Network bytes sent."""
24
22
 
25
23
  name = "network.sent"
26
24
  samples: "Deque[float]"
@@ -45,9 +43,7 @@ class NetworkSent:
45
43
 
46
44
 
47
45
  class NetworkRecv:
48
- """
49
- Network bytes received.
50
- """
46
+ """Network bytes received."""
51
47
 
52
48
  name = "network.recv"
53
49
  samples: "Deque[float]"
@@ -101,11 +97,11 @@ class Network:
101
97
 
102
98
  @classmethod
103
99
  def is_available(cls) -> bool:
104
- """Return a new instance of the CPU metrics"""
100
+ """Return a new instance of the CPU metrics."""
105
101
  return psutil is not None
106
102
 
107
103
  def probe(self) -> dict:
108
- """Return a dict of the hardware information"""
104
+ """Return a dict of the hardware information."""
109
105
  # net_if_addrs = psutil.net_if_addrs()
110
106
 
111
107
  # return {
@@ -0,0 +1,283 @@
1
+ import logging
2
+ import multiprocessing as mp
3
+ import re
4
+ import sys
5
+ from collections import defaultdict, deque
6
+ from functools import lru_cache
7
+ from hashlib import md5
8
+ from types import ModuleType
9
+ from typing import TYPE_CHECKING, Dict, List, Mapping, Tuple, Union
10
+
11
+ if sys.version_info >= (3, 8):
12
+ from typing import Final
13
+ else:
14
+ from typing_extensions import Final
15
+
16
+ import requests
17
+ import requests.adapters
18
+ import urllib3
19
+
20
+ import wandb
21
+ from wandb.sdk.lib import telemetry
22
+
23
+ from .aggregators import aggregate_last, aggregate_mean
24
+ from .interfaces import Interface, Metric, MetricsMonitor
25
+
26
+ if TYPE_CHECKING:
27
+ from typing import Deque, Optional
28
+
29
+ from wandb.sdk.internal.settings_static import SettingsStatic
30
+
31
+
32
# Namespace prepended to every stat key produced by this module.
_PREFIX: Final[str] = "openmetrics"

# HTTP settings shared by every session created in this module.
_REQUEST_TIMEOUT = 3
_REQUEST_POOL_MAXSIZE = 4
_REQUEST_POOL_CONNECTIONS = 4
_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=(408, 409, 429, 500, 502, 503, 504),
)


logger = logging.getLogger(__name__)


# prometheus_client is optional: the name stays None when the package is
# missing, and callers check it before attempting to parse anything.
prometheus_client_parser: "Optional[ModuleType]" = None
try:
    import prometheus_client.parser  # type: ignore
except ImportError:
    pass
else:
    prometheus_client_parser = prometheus_client.parser
54
+
55
+
56
def _setup_requests_session() -> requests.Session:
    """Create a `requests.Session` using the module's retry and pool settings.

    A single adapter built from `_REQUEST_RETRY_STRATEGY`,
    `_REQUEST_POOL_CONNECTIONS` and `_REQUEST_POOL_MAXSIZE` is mounted for
    both the "http://" and "https://" prefixes.
    """
    http_session = requests.Session()
    retrying_adapter = requests.adapters.HTTPAdapter(
        pool_maxsize=_REQUEST_POOL_MAXSIZE,
        pool_connections=_REQUEST_POOL_CONNECTIONS,
        max_retries=_REQUEST_RETRY_STRATEGY,
    )
    for url_prefix in ("http://", "https://"):
        http_session.mount(url_prefix, retrying_adapter)
    return http_session
66
+
67
+
68
+ def _nested_dict_to_tuple(
69
+ nested_dict: Mapping[str, Mapping[str, str]]
70
+ ) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
71
+ return tuple((k, *v.items()) for k, v in nested_dict.items()) # type: ignore
72
+
73
+
74
+ def _tuple_to_nested_dict(
75
+ nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...]
76
+ ) -> Dict[str, Dict[str, str]]:
77
+ return {k: dict(v) for k, *v in nested_tuple}
78
+
79
+
80
+ @lru_cache(maxsize=128)
81
+ def _should_capture_metric(
82
+ metric_name: str,
83
+ metric_labels: Tuple[str, ...],
84
+ filters: Tuple[Tuple[str, Tuple[str, str]], ...],
85
+ ) -> bool:
86
+ # we use tuples to make the function arguments hashable => usable with lru_cache
87
+ should_capture = False
88
+
89
+ if not filters:
90
+ return should_capture
91
+
92
+ # self.filters keys are regexes, check the name against them
93
+ # and for the first match, check the labels against the label filters.
94
+ # assume that if at least one label filter doesn't match, the metric
95
+ # should not be captured.
96
+ # it's up to the user to make sure that the filters are not conflicting etc.
97
+ metric_labels_dict = {t[0]: t[1] for t in metric_labels}
98
+ filters_dict = _tuple_to_nested_dict(filters)
99
+ for metric_name_regex, label_filters in filters_dict.items():
100
+ if not re.match(metric_name_regex, metric_name):
101
+ continue
102
+
103
+ should_capture = True
104
+
105
+ for label, label_filter in label_filters.items():
106
+ if not re.match(label_filter, metric_labels_dict.get(label, "")):
107
+ should_capture = False
108
+ break
109
+ break
110
+
111
+ return should_capture
112
+
113
+
114
class OpenMetricsMetric:
    """Container for all the COUNTER and GAUGE metrics extracted from an OpenMetrics endpoint."""

    def __init__(
        self, name: str, url: str, filters: Mapping[str, Mapping[str, str]]
    ) -> None:
        self.name = name
        self.url = url
        self.filters = filters
        # hashable form of the filters, passed to the cached filter check
        self.filters_tuple = _nested_dict_to_tuple(filters)
        # created in setup(), released in teardown()
        self._session: Optional["requests.Session"] = None
        self.samples: "Deque[dict]" = deque()
        # {"<metric name>": {"<labels hash>": <index>}}
        self.label_map: "Dict[str, Dict[str, int]]" = defaultdict(dict)
        # {"<labels hash>": <labels>}
        self.label_hashes: "Dict[str, dict]" = {}

    def setup(self) -> None:
        """Create the HTTP session unless one already exists."""
        if self._session is None:
            self._session = _setup_requests_session()

    def teardown(self) -> None:
        """Close and drop the HTTP session, if there is one."""
        session = self._session
        if session is not None:
            session.close()
            self._session = None

    def parse_open_metrics_endpoint(self) -> Dict[str, Union[str, int, float]]:
        """Fetch the endpoint and return the captured sample values.

        Only families of type "counter" or "gauge" are considered, and only
        samples accepted by `_should_capture_metric`. Keys have the form
        ``"<sample name>.<index>"``, where the index is the order in which a
        distinct label set was first seen for that sample name.

        Requires `setup()` to have been called and the prometheus parser to
        be importable (both are asserted). HTTP error statuses raise via
        ``raise_for_status``.
        """
        assert prometheus_client_parser is not None
        assert self._session is not None

        response = self._session.get(self.url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()

        captured = {}
        families = prometheus_client_parser.text_string_to_metric_families(
            response.text
        )
        for family in families:
            # todo: add support for other metric types?
            # todo: log warning about that?
            if family.type not in ("counter", "gauge"):
                continue

            for sample in family.samples:
                accepted = _should_capture_metric(
                    sample.name,
                    tuple(sample.labels.items()),
                    self.filters_tuple,
                )
                if not accepted:
                    continue

                known_label_sets = self.label_map[sample.name]
                # md5 hash of the labels' string form identifies the label set
                digest = md5(str(sample.labels).encode("utf-8")).hexdigest()
                if digest not in known_label_sets:
                    # first sighting: assign the next index and keep the labels
                    known_label_sets[digest] = len(known_label_sets)
                    self.label_hashes[digest] = sample.labels
                captured[f"{sample.name}.{known_label_sets[digest]}"] = sample.value

        return captured

    def sample(self) -> None:
        """Scrape the endpoint once and append the result to `samples`."""
        self.samples.append(self.parse_open_metrics_endpoint())

    def clear(self) -> None:
        """Discard all collected samples."""
        self.samples.clear()

    def aggregate(self) -> dict:
        """Reduce the collected samples to one value per key.

        Keys are taken from the first sample only. A key whose values are all
        int/float is reduced with `aggregate_mean`; anything else goes through
        `aggregate_last`. Result keys are prefixed with
        ``"<_PREFIX>.<name>."``. Returns ``{}`` when nothing was sampled.
        """
        if not self.samples:
            return {}

        prefix = f"{_PREFIX}.{self.name}."

        stats = {}
        for key in self.samples[0]:
            values = [entry[key] for entry in self.samples if key in entry]
            numeric = bool(values) and all(
                isinstance(value, (int, float)) for value in values
            )
            reducer = aggregate_mean if numeric else aggregate_last
            stats[f"{prefix}{key}"] = reducer(values)
        return stats
201
+
202
+
203
class OpenMetrics:
    """Poll an OpenMetrics endpoint, parse the response and return a dict of metrics.

    Implements the same Protocol interface as Asset, except that
    `is_available` takes the endpoint URL to check.
    """

    def __init__(
        self,
        interface: "Interface",
        settings: "SettingsStatic",
        shutdown_event: mp.synchronize.Event,
        name: str,
        url: str,
    ) -> None:
        self.name = name
        self.url = url
        self.interface = interface
        self.settings = settings
        self.shutdown_event = shutdown_event

        # a single metric object covers everything scraped from this endpoint
        endpoint_metric = OpenMetricsMetric(
            name, url, settings._stats_open_metrics_filters
        )
        self.metrics: List[Metric] = [endpoint_metric]

        self.metrics_monitor: "MetricsMonitor" = MetricsMonitor(
            asset_name=self.name,
            metrics=self.metrics,
            interface=interface,
            settings=settings,
            shutdown_event=shutdown_event,
        )

        # record that the OpenMetrics feature is in use
        telemetry_record = telemetry.TelemetryRecord()
        telemetry_record.feature.open_metrics = True
        interface._publish_telemetry(telemetry_record)

    @classmethod
    def is_available(cls, url: str) -> bool:
        """Report whether `url` serves a parseable, non-empty OpenMetrics payload.

        Returns False (after a one-time terminal warning) when
        `prometheus_client` is not importable. Any exception raised while
        fetching or parsing is logged at debug level and also yields False.
        """
        if prometheus_client_parser is None:
            wandb.termwarn(
                "Monitoring OpenMetrics endpoints requires the `prometheus_client` package. "
                "To install it, run `pip install prometheus_client`.",
                repeat=False,
            )
            return False

        endpoint_ok = False
        probe_session: Optional[requests.Session] = None
        try:
            probe_session = _setup_requests_session()
            response = probe_session.get(url, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()

            # text_string_to_metric_families returns a generator, so it is
            # materialized to check that at least one family was parsed
            families = list(
                prometheus_client_parser.text_string_to_metric_families(response.text)
            )
            endpoint_ok = bool(families)
        except Exception as e:
            logger.debug(
                f"OpenMetrics endpoint {url} is not available: {e}", exc_info=True
            )

        # best-effort cleanup: a failure to close must not change the result
        if probe_session is not None:
            try:
                probe_session.close()
            except Exception:
                pass
        return endpoint_ok

    def start(self) -> None:
        """Start the underlying metrics monitor."""
        self.metrics_monitor.start()

    def finish(self) -> None:
        """Finish the underlying metrics monitor."""
        self.metrics_monitor.finish()

    def probe(self) -> dict:
        """Return static information: a mapping of the asset name to its URL."""
        # todo: also return self.label_hashes
        return {self.name: self.url}
@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
19
  class TPUUtilization:
20
- """
21
- Google Cloud TPU utilization in percent.
22
- """
20
+ """Google Cloud TPU utilization in percent."""
23
21
 
24
22
  name = "tpu"
25
23
  samples: "Deque[float]"
@@ -130,7 +128,6 @@ class TPU:
130
128
 
131
129
  @classmethod
132
130
  def is_available(cls) -> bool:
133
-
134
131
  if os.environ.get("TPU_NAME", False) is False:
135
132
  return False
136
133