wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. wandb/__init__.py +5 -1
  2. wandb/apis/public.py +137 -17
  3. wandb/apis/reports/_panels.py +1 -1
  4. wandb/apis/reports/blocks.py +1 -0
  5. wandb/apis/reports/report.py +27 -5
  6. wandb/cli/cli.py +52 -41
  7. wandb/docker/__init__.py +17 -0
  8. wandb/docker/auth.py +1 -1
  9. wandb/env.py +24 -4
  10. wandb/filesync/step_checksum.py +3 -3
  11. wandb/integration/openai/openai.py +3 -0
  12. wandb/integration/ultralytics/__init__.py +9 -0
  13. wandb/integration/ultralytics/bbox_utils.py +196 -0
  14. wandb/integration/ultralytics/callback.py +458 -0
  15. wandb/integration/ultralytics/classification_utils.py +66 -0
  16. wandb/integration/ultralytics/mask_utils.py +141 -0
  17. wandb/integration/ultralytics/pose_utils.py +92 -0
  18. wandb/integration/xgboost/xgboost.py +3 -3
  19. wandb/integration/yolov8/__init__.py +0 -7
  20. wandb/integration/yolov8/yolov8.py +22 -3
  21. wandb/old/settings.py +7 -0
  22. wandb/plot/line_series.py +0 -1
  23. wandb/proto/v3/wandb_internal_pb2.py +353 -300
  24. wandb/proto/v3/wandb_server_pb2.py +37 -41
  25. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  26. wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
  27. wandb/proto/v4/wandb_internal_pb2.py +272 -260
  28. wandb/proto/v4/wandb_server_pb2.py +37 -40
  29. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  30. wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
  31. wandb/proto/wandb_internal_codegen.py +7 -31
  32. wandb/sdk/artifacts/artifact.py +321 -189
  33. wandb/sdk/artifacts/artifact_cache.py +14 -0
  34. wandb/sdk/artifacts/artifact_manifest.py +5 -4
  35. wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
  36. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
  37. wandb/sdk/artifacts/artifact_saver.py +13 -50
  38. wandb/sdk/artifacts/artifact_ttl.py +6 -0
  39. wandb/sdk/artifacts/artifacts_cache.py +119 -93
  40. wandb/sdk/artifacts/staging.py +25 -0
  41. wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
  42. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
  43. wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
  44. wandb/sdk/artifacts/storage_policies/register.py +1 -0
  45. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
  46. wandb/sdk/artifacts/storage_policy.py +4 -2
  47. wandb/sdk/backend/backend.py +0 -16
  48. wandb/sdk/data_types/image.py +3 -1
  49. wandb/sdk/integration_utils/auto_logging.py +38 -13
  50. wandb/sdk/interface/interface.py +16 -135
  51. wandb/sdk/interface/interface_shared.py +9 -147
  52. wandb/sdk/interface/interface_sock.py +0 -26
  53. wandb/sdk/internal/file_pusher.py +20 -3
  54. wandb/sdk/internal/file_stream.py +3 -1
  55. wandb/sdk/internal/handler.py +53 -70
  56. wandb/sdk/internal/internal_api.py +220 -130
  57. wandb/sdk/internal/job_builder.py +41 -37
  58. wandb/sdk/internal/sender.py +7 -25
  59. wandb/sdk/internal/system/assets/disk.py +144 -11
  60. wandb/sdk/internal/system/system_info.py +6 -2
  61. wandb/sdk/launch/__init__.py +5 -0
  62. wandb/sdk/launch/{launch.py → _launch.py} +53 -54
  63. wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
  64. wandb/sdk/launch/_project_spec.py +13 -2
  65. wandb/sdk/launch/agent/agent.py +103 -59
  66. wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
  67. wandb/sdk/launch/builder/build.py +19 -1
  68. wandb/sdk/launch/builder/docker_builder.py +5 -1
  69. wandb/sdk/launch/builder/kaniko_builder.py +5 -1
  70. wandb/sdk/launch/create_job.py +20 -5
  71. wandb/sdk/launch/loader.py +14 -5
  72. wandb/sdk/launch/runner/abstract.py +0 -2
  73. wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
  74. wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
  75. wandb/sdk/launch/runner/local_container.py +5 -2
  76. wandb/sdk/launch/runner/local_process.py +4 -1
  77. wandb/sdk/launch/sweeps/scheduler.py +43 -25
  78. wandb/sdk/launch/sweeps/utils.py +5 -3
  79. wandb/sdk/launch/utils.py +3 -1
  80. wandb/sdk/lib/_settings_toposort_generate.py +3 -9
  81. wandb/sdk/lib/_settings_toposort_generated.py +27 -3
  82. wandb/sdk/lib/_wburls_generated.py +1 -0
  83. wandb/sdk/lib/filenames.py +27 -6
  84. wandb/sdk/lib/filesystem.py +181 -7
  85. wandb/sdk/lib/fsm.py +5 -3
  86. wandb/sdk/lib/gql_request.py +3 -0
  87. wandb/sdk/lib/ipython.py +7 -0
  88. wandb/sdk/lib/wburls.py +1 -0
  89. wandb/sdk/service/port_file.py +2 -15
  90. wandb/sdk/service/server.py +7 -55
  91. wandb/sdk/service/service.py +56 -26
  92. wandb/sdk/service/service_base.py +1 -1
  93. wandb/sdk/service/streams.py +11 -5
  94. wandb/sdk/verify/verify.py +2 -2
  95. wandb/sdk/wandb_init.py +8 -2
  96. wandb/sdk/wandb_manager.py +4 -14
  97. wandb/sdk/wandb_run.py +143 -53
  98. wandb/sdk/wandb_settings.py +148 -35
  99. wandb/testing/relay.py +85 -38
  100. wandb/util.py +87 -4
  101. wandb/wandb_torch.py +24 -38
  102. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
  103. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
  104. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
  105. wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
  106. wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
  107. wandb/proto/wandb_server_pb2_grpc.py +0 -8
  108. wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
  109. wandb/sdk/interface/interface_grpc.py +0 -460
  110. wandb/sdk/service/server_grpc.py +0 -444
  111. wandb/sdk/service/service_grpc.py +0 -73
  112. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
  113. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
  114. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,329 @@
1
+ import logging
2
+ from threading import Lock, Thread
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import urllib3
6
+ from kubernetes import watch # type: ignore # noqa: F401
7
+ from kubernetes.client import ( # type: ignore # noqa: F401
8
+ ApiException,
9
+ BatchV1Api,
10
+ CoreV1Api,
11
+ CustomObjectsApi,
12
+ V1PodStatus,
13
+ )
14
+
15
+ import wandb
16
+
17
+ from .abstract import State, Status
18
+
19
# Dict for mapping possible states of custom objects to the states we want to report
# to the agent. All keys are lower-case; the condition-based lookup in this module
# lower-cases condition types before consulting this table, and names that are not
# listed here produce no state.
CRD_STATE_DICT: Dict[str, State] = {
    # Starting states.
    "created": "starting",
    "pending": "starting",
    # Running states.
    "running": "running",
    "completing": "running",
    # Finished states.
    "succeeded": "finished",
    "completed": "finished",
    # Failed states.
    "failed": "failed",
    "aborted": "failed",
    "timeout": "failed",
    "terminated": "failed",
    # Stopping states.
    "terminating": "stopping",
}


# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
42
+
43
+
44
class SafeWatch:
    """Wrapper for the kubernetes watch class that can recover in more situations.

    The wrapper remembers the resource version of the last event it yielded and
    passes it back to the wrapped watcher when the stream has to be re-opened.
    """

    def __init__(self, watcher: "watch.Watch") -> None:
        """Initialize the SafeWatch.

        Arguments:
            watcher: The kubernetes watch object to wrap.
        """
        self._watcher = watcher
        self._last_seen_resource_version: Optional[str] = None
        self._stopped = False

    def stream(self, func: Any, *args: Any, **kwargs: Any) -> Any:
        """Stream events from the wrapped watcher, re-opening the stream as needed.

        Arguments:
            func: The list function handed to the wrapped watcher's ``stream``.
            *args: Positional arguments forwarded to the wrapped watcher.
            **kwargs: Keyword arguments forwarded to the wrapped watcher.
                ``timeout_seconds`` defaults to 15 when the caller does not
                supply it, and ``resource_version`` is managed by this method
                once the first event has been seen.

        Yields:
            The events produced by the wrapped watcher, until `stop` is called.
        """
        # Using setdefault (rather than a second explicit keyword) lets callers
        # pass their own timeout without triggering a duplicate-keyword TypeError.
        kwargs.setdefault("timeout_seconds", 15)
        # Checking the flag here means a stream that breaks after stop() was
        # requested is not re-opened.
        while not self._stopped:
            try:
                for event in self._watcher.stream(func, *args, **kwargs):
                    if self._stopped:
                        break
                    # Save the resource version so that we can resume the stream
                    # if it breaks.
                    resource = event.get("object")
                    if isinstance(resource, dict):
                        version = resource.get("metadata", {}).get("resourceVersion")
                    else:
                        version = resource.metadata.resource_version
                    self._last_seen_resource_version = version
                    kwargs["resource_version"] = version
                    yield event
            except urllib3.exceptions.ProtocolError as e:
                wandb.termwarn(f"Broken event stream: {e}")
            except ApiException as e:
                # NOTE(review): any other API error is retried silently and
                # immediately; confirm whether that is intended.
                if e.status == 410:
                    # If resource version is too old we need to start over. The
                    # key is absent when no event was seen yet, so pop() is used
                    # to avoid a KeyError escaping from this handler.
                    kwargs.pop("resource_version", None)
                    self._last_seen_resource_version = None
            except Exception as e:
                wandb.termerror(f"Unknown exception in event stream: {e}")

    def stop(self) -> None:
        """Stop the wrapped watcher and end any active `stream` loop."""
        self._watcher.stop()
        self._stopped = True
92
+
93
+
94
def _is_preempted(status: "V1PodStatus") -> bool:
    """Report whether the pod status carries a disruption condition.

    Arguments:
        status: The pod status to inspect.

    Returns:
        True if any condition has type "DisruptionTarget" together with a reason
        of "EvictionByEvictionAPI", "PreemptionByScheduler", or
        "TerminationByKubelet". False otherwise, including when the status has
        no conditions.
    """
    pod_conditions = getattr(status, "conditions", None)
    if pod_conditions is None:
        return False
    disruption_reasons = [
        "EvictionByEvictionAPI",
        "PreemptionByScheduler",
        "TerminationByKubelet",
    ]
    return any(
        entry.type == "DisruptionTarget" and entry.reason in disruption_reasons
        for entry in pod_conditions
    )
105
+
106
+
107
def _is_container_creating(status: "V1PodStatus") -> bool:
    """Report whether any container of the pod is waiting to be created.

    Arguments:
        status: The pod status to inspect.

    Returns:
        True if at least one container status has a waiting state whose reason
        is "ContainerCreating". False otherwise, including when the pod has no
        container statuses.
    """
    for entry in status.container_statuses or []:
        container_state = entry.state
        if not container_state:
            continue
        waiting = container_state.waiting
        if not waiting:
            continue
        if waiting.reason == "ContainerCreating":
            return True
    return False
117
+
118
+
119
def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[str]:
    """Get the run state from the status conditions of a custom object.

    Only conditions whose "status" is the string "True" are considered. Their
    "type" is lower-cased and looked up in CRD_STATE_DICT. Entries that are not
    dicts, or whose type is missing or not a string, are skipped instead of
    raising.

    Arguments:
        conditions: The list of condition dicts taken from the object status.

    Returns:
        The highest-priority state among the matching conditions, or None when
        no condition maps to a known state.
    """
    active_types = set()
    for condition in conditions:
        if not isinstance(condition, dict) or condition.get("status") != "True":
            continue
        condition_type = condition.get("type")
        if isinstance(condition_type, str):
            active_types.add(condition_type.lower())
    detected_states = {
        CRD_STATE_DICT[name] for name in active_types if name in CRD_STATE_DICT
    }
    # Several conditions can be true at once; terminal states are checked first
    # so they win over "running" and "starting".
    for state in ("finished", "failed", "stopping", "running", "starting"):
        if state in detected_states:
            return state
    return None
131
+
132
+
133
class KubernetesRunMonitor:
    """Watches the kubernetes resources of a launched run and tracks its status."""

    # Once one of these states is recorded the watchers are stopped and the
    # status is no longer changed.
    _TERMINAL_STATES = ("finished", "failed", "preempted")

    def __init__(
        self,
        job_field_selector: str,
        pod_label_selector: str,
        namespace: str,
        batch_api: "BatchV1Api",
        core_api: "CoreV1Api",
        custom_api: Optional["CustomObjectsApi"] = None,
        group: Optional[str] = None,
        version: Optional[str] = None,
        plural: Optional[str] = None,
    ) -> None:
        """Initialize KubernetesRunMonitor.

        If a custom api is provided, the group, version, and plural arguments must also
        be provided. These are used to query the custom api for a launched custom
        object (CRD). Group, version, and plural in this context refer to the
        Kubernetes API group, version, and plural for the CRD. For more information
        see: https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/

        The run monitor starts two threads to watch for pods and jobs/crds matching the
        provided selectors. The status is set to "starting" when the run monitor is
        initialized. The status is set to "running" when a pod matching the pod selector
        is found with a phase of "Running" or has a container waiting with a reason of
        "ContainerCreating". The status is set to "finished" when a job matching the job
        selector reports one succeeded pod. The status is set to "failed" when a job
        matching the job selector reports at least one failed pod. The status is set
        to "preempted" when a pod matching the pod selector is found with a condition
        type of "DisruptionTarget" and a reason of "EvictionByEvictionAPI",
        "PreemptionByScheduler", or "TerminationByKubelet".

        The logic for the CRD is similar to the logic for the job, but we inspect
        both the phase of the CRD and the conditions since some CRDs do not have a
        phase field.

        Once a "finished", "failed", or "preempted" status has been recorded the
        watchers are stopped and later updates are ignored.

        Arguments:
            job_field_selector: The field selector for the job or crd.
            pod_label_selector: The label selector for the pods.
            namespace: The namespace to monitor.
            batch_api: The batch api client.
            core_api: The core api client.
            custom_api: The custom api client.
            group: The group of the CRD.
            version: The version of the CRD.
            plural: The plural of the CRD.

        Returns:
            None.
        """
        self.pod_label_selector = pod_label_selector
        self.job_field_selector = job_field_selector
        self.namespace = namespace
        self.batch_api = batch_api
        self.core_api = core_api
        self.custom_api = custom_api
        self.group = group
        self.version = version
        self.plural = plural

        self._status_lock = Lock()
        self._status = Status("starting")

        # Only one of the job or crd watchers will be used.
        self._watch_job_thread = Thread(target=self._watch_job, daemon=True)
        self._watch_crd_thread = Thread(target=self._watch_crd, daemon=True)

        self._watch_pods_thread = Thread(target=self._watch_pods, daemon=True)

        self._job_watcher = SafeWatch(watch.Watch())
        self._pod_watcher = SafeWatch(watch.Watch())

    def start(self) -> None:
        """Start the run monitor."""
        if self.custom_api is None:
            self._watch_job_thread.start()
        else:
            self._watch_crd_thread.start()
        self._watch_pods_thread.start()

    def stop(self) -> None:
        """Stop the run monitor."""
        self._job_watcher.stop()
        self._pod_watcher.stop()

    def _set_status(self, status: Status) -> None:
        """Set the run status unless a terminal state was already recorded.

        The pod watcher and the job/crd watcher run on separate threads, so an
        event that was already being handled when the other thread recorded a
        terminal state must not overwrite that state.
        """
        with self._status_lock:
            if self._status.state in self._TERMINAL_STATES:
                return
            self._status = status

    def _start_watcher(self, target: Any) -> Thread:
        """Create and start a daemon thread running the given watcher method."""
        thread = Thread(target=target, daemon=True)
        thread.start()
        return thread

    def get_status(self) -> Status:
        """Get the run status, restarting any watcher thread that has died."""
        with self._status_lock:
            # Each time this is called we verify that our watchers are active.
            if self._status.state in ["running", "starting"]:
                if self.custom_api is None:
                    if not self._watch_job_thread.is_alive():
                        wandb.termwarn(
                            f"Job watcher thread is dead for {self.job_field_selector}"
                        )
                        self._watch_job_thread = self._start_watcher(self._watch_job)
                else:
                    if not self._watch_crd_thread.is_alive():
                        wandb.termwarn(
                            f"CRD watcher thread is dead for {self.job_field_selector}"
                        )
                        self._watch_crd_thread = self._start_watcher(self._watch_crd)
                if not self._watch_pods_thread.is_alive():
                    wandb.termwarn(
                        f"Pod watcher thread is dead for {self.pod_label_selector}"
                    )
                    self._watch_pods_thread = self._start_watcher(self._watch_pods)
            return self._status

    def _watch_pods(self) -> None:
        """Watch for pods created matching the jobname."""
        # Stream pod status updates; SafeWatch re-opens the stream as needed.
        for event in self._pod_watcher.stream(
            self.core_api.list_namespaced_pod,
            namespace=self.namespace,
            label_selector=self.pod_label_selector,
        ):
            pod = event.get("object")
            # Sometimes ADDED events will be missing the status field, or carry
            # an empty one; skip them instead of letting the thread die.
            pod_status = getattr(pod, "status", None)
            if pod_status is None:
                continue
            if pod_status.phase == "Running":
                self._set_status(Status("running"))
            if _is_preempted(pod_status):
                self._set_status(Status("preempted"))
                self.stop()
                break
            if _is_container_creating(pod_status):
                self._set_status(Status("running"))

    def _watch_job(self) -> None:
        """Watch for job matching the jobname."""
        for event in self._job_watcher.stream(
            self.batch_api.list_namespaced_job,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
        ):
            job = event.get("object")
            # Skip events without a status instead of letting the thread die.
            job_status = getattr(job, "status", None)
            if job_status is None:
                continue
            if job_status.succeeded == 1:
                self._set_status(Status("finished"))
                self.stop()
                break
            elif job_status.failed is not None and job_status.failed >= 1:
                self._set_status(Status("failed"))
                self.stop()
                break

    def _crd_state(self, status: Dict[str, Any]) -> Optional[str]:
        """Translate the status of a custom object into a run state, if possible."""
        raw_state = status.get("state")
        if isinstance(raw_state, dict):
            phase = raw_state.get("phase", "")
            if not isinstance(phase, str):
                return None
            # CRD_STATE_DICT keys are lower-case, so match the phase
            # case-insensitively, the same way condition types are matched.
            return CRD_STATE_DICT.get(phase.lower())
        conditions = status.get("conditions")
        if isinstance(conditions, list):
            return _state_from_conditions(conditions)
        # This should never happen.
        _logger.warning(
            f"Unexpected conditions type {type(conditions)} "
            f"for CRD {self.job_field_selector}: {conditions}"
        )
        # Never report an unmapped raw value: a plain string state is mapped
        # through the same table, anything else yields no state.
        if isinstance(raw_state, str):
            return CRD_STATE_DICT.get(raw_state.lower())
        return None

    def _watch_crd(self) -> None:
        """Watch for CRD matching the jobname."""
        for event in self._job_watcher.stream(
            self.custom_api.list_namespaced_custom_object,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
            group=self.group,
            version=self.version,
            plural=self.plural,
        ):
            resource = event.get("object")
            resource_status = resource.get("status")
            if resource_status is None:
                continue
            state = self._crd_state(resource_status)
            if state is None:
                continue
            run_status = Status(state)
            self._set_status(run_status)
            if run_status.state in self._TERMINAL_STATES:
                self.stop()
                break