wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +5 -1
- wandb/apis/public.py +137 -17
- wandb/apis/reports/_panels.py +1 -1
- wandb/apis/reports/blocks.py +1 -0
- wandb/apis/reports/report.py +27 -5
- wandb/cli/cli.py +52 -41
- wandb/docker/__init__.py +17 -0
- wandb/docker/auth.py +1 -1
- wandb/env.py +24 -4
- wandb/filesync/step_checksum.py +3 -3
- wandb/integration/openai/openai.py +3 -0
- wandb/integration/ultralytics/__init__.py +9 -0
- wandb/integration/ultralytics/bbox_utils.py +196 -0
- wandb/integration/ultralytics/callback.py +458 -0
- wandb/integration/ultralytics/classification_utils.py +66 -0
- wandb/integration/ultralytics/mask_utils.py +141 -0
- wandb/integration/ultralytics/pose_utils.py +92 -0
- wandb/integration/xgboost/xgboost.py +3 -3
- wandb/integration/yolov8/__init__.py +0 -7
- wandb/integration/yolov8/yolov8.py +22 -3
- wandb/old/settings.py +7 -0
- wandb/plot/line_series.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +353 -300
- wandb/proto/v3/wandb_server_pb2.py +37 -41
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
- wandb/proto/v4/wandb_internal_pb2.py +272 -260
- wandb/proto/v4/wandb_server_pb2.py +37 -40
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
- wandb/proto/wandb_internal_codegen.py +7 -31
- wandb/sdk/artifacts/artifact.py +321 -189
- wandb/sdk/artifacts/artifact_cache.py +14 -0
- wandb/sdk/artifacts/artifact_manifest.py +5 -4
- wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
- wandb/sdk/artifacts/artifact_saver.py +13 -50
- wandb/sdk/artifacts/artifact_ttl.py +6 -0
- wandb/sdk/artifacts/artifacts_cache.py +119 -93
- wandb/sdk/artifacts/staging.py +25 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
- wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
- wandb/sdk/artifacts/storage_policies/register.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
- wandb/sdk/artifacts/storage_policy.py +4 -2
- wandb/sdk/backend/backend.py +0 -16
- wandb/sdk/data_types/image.py +3 -1
- wandb/sdk/integration_utils/auto_logging.py +38 -13
- wandb/sdk/interface/interface.py +16 -135
- wandb/sdk/interface/interface_shared.py +9 -147
- wandb/sdk/interface/interface_sock.py +0 -26
- wandb/sdk/internal/file_pusher.py +20 -3
- wandb/sdk/internal/file_stream.py +3 -1
- wandb/sdk/internal/handler.py +53 -70
- wandb/sdk/internal/internal_api.py +220 -130
- wandb/sdk/internal/job_builder.py +41 -37
- wandb/sdk/internal/sender.py +7 -25
- wandb/sdk/internal/system/assets/disk.py +144 -11
- wandb/sdk/internal/system/system_info.py +6 -2
- wandb/sdk/launch/__init__.py +5 -0
- wandb/sdk/launch/{launch.py → _launch.py} +53 -54
- wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
- wandb/sdk/launch/_project_spec.py +13 -2
- wandb/sdk/launch/agent/agent.py +103 -59
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
- wandb/sdk/launch/builder/build.py +19 -1
- wandb/sdk/launch/builder/docker_builder.py +5 -1
- wandb/sdk/launch/builder/kaniko_builder.py +5 -1
- wandb/sdk/launch/create_job.py +20 -5
- wandb/sdk/launch/loader.py +14 -5
- wandb/sdk/launch/runner/abstract.py +0 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
- wandb/sdk/launch/runner/local_container.py +5 -2
- wandb/sdk/launch/runner/local_process.py +4 -1
- wandb/sdk/launch/sweeps/scheduler.py +43 -25
- wandb/sdk/launch/sweeps/utils.py +5 -3
- wandb/sdk/launch/utils.py +3 -1
- wandb/sdk/lib/_settings_toposort_generate.py +3 -9
- wandb/sdk/lib/_settings_toposort_generated.py +27 -3
- wandb/sdk/lib/_wburls_generated.py +1 -0
- wandb/sdk/lib/filenames.py +27 -6
- wandb/sdk/lib/filesystem.py +181 -7
- wandb/sdk/lib/fsm.py +5 -3
- wandb/sdk/lib/gql_request.py +3 -0
- wandb/sdk/lib/ipython.py +7 -0
- wandb/sdk/lib/wburls.py +1 -0
- wandb/sdk/service/port_file.py +2 -15
- wandb/sdk/service/server.py +7 -55
- wandb/sdk/service/service.py +56 -26
- wandb/sdk/service/service_base.py +1 -1
- wandb/sdk/service/streams.py +11 -5
- wandb/sdk/verify/verify.py +2 -2
- wandb/sdk/wandb_init.py +8 -2
- wandb/sdk/wandb_manager.py +4 -14
- wandb/sdk/wandb_run.py +143 -53
- wandb/sdk/wandb_settings.py +148 -35
- wandb/testing/relay.py +85 -38
- wandb/util.py +87 -4
- wandb/wandb_torch.py +24 -38
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
- wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/wandb_server_pb2_grpc.py +0 -8
- wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
- wandb/sdk/interface/interface_grpc.py +0 -460
- wandb/sdk/service/server_grpc.py +0 -444
- wandb/sdk/service/service_grpc.py +0 -73
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,329 @@
|
|
1
|
+
import logging
|
2
|
+
from threading import Lock, Thread
|
3
|
+
from typing import Any, Dict, List, Optional
|
4
|
+
|
5
|
+
import urllib3
|
6
|
+
from kubernetes import watch # type: ignore # noqa: F401
|
7
|
+
from kubernetes.client import ( # type: ignore # noqa: F401
|
8
|
+
ApiException,
|
9
|
+
BatchV1Api,
|
10
|
+
CoreV1Api,
|
11
|
+
CustomObjectsApi,
|
12
|
+
V1PodStatus,
|
13
|
+
)
|
14
|
+
|
15
|
+
import wandb
|
16
|
+
|
17
|
+
from .abstract import State, Status
|
18
|
+
|
19
|
+
# Lookup table translating the (lowercase) phase / condition names reported by
# custom objects into the run states that are reported to the agent.
CRD_STATE_DICT: Dict[str, State] = {
    # Not yet running -> "starting".
    "created": "starting",
    "pending": "starting",
    # Actively executing -> "running".
    "running": "running",
    "completing": "running",
    # Ended successfully -> "finished".
    "succeeded": "finished",
    "completed": "finished",
    # Ended unsuccessfully -> "failed".
    "failed": "failed",
    "aborted": "failed",
    "timeout": "failed",
    "terminated": "failed",
    # Shutting down -> "stopping".
    "terminating": "stopping",
}


# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
|
42
|
+
|
43
|
+
|
44
|
+
class SafeWatch:
    """Wrapper for the kubernetes watch class that can recover in more situations.

    The wrapper remembers the resource version of the last event it yielded and
    passes it back to the wrapped watcher whenever the underlying stream has to
    be reopened, so that a restarted stream resumes from the last seen event.
    """

    def __init__(self, watcher: "watch.Watch") -> None:
        """Initialize the SafeWatch.

        Arguments:
            watcher: The watch object whose ``stream`` and ``stop`` methods are
                wrapped.
        """
        self._watcher = watcher
        # Resource version of the most recently yielded event, if any.
        self._last_seen_resource_version: Optional[str] = None
        # Set by stop(); checked before each event is yielded and again each
        # time the wrapped stream ends.
        self._stopped = False

    def stream(self, func: Any, *args: Any, **kwargs: Any) -> Any:
        """Stream events from the wrapped watcher, reopening the stream as needed.

        Each call to the wrapped ``stream`` is made with ``timeout_seconds=15``.
        When that stream ends, or raises, without ``stop`` having been called,
        a new one is opened with the last seen resource version.

        Arguments:
            func: The list function handed to the wrapped watcher.
            *args: Positional arguments forwarded to the wrapped watcher.
            **kwargs: Keyword arguments forwarded to the wrapped watcher.
                ``resource_version`` is managed by this method once events
                have been received.

        Yields:
            The events produced by the wrapped watcher.
        """
        while True:
            try:
                for event in self._watcher.stream(
                    func, *args, **kwargs, timeout_seconds=15
                ):
                    if self._stopped:
                        break
                    # Save the resource version so that we can resume the stream
                    # if it breaks. Events carry either a plain dict or a model
                    # object exposing ``metadata.resource_version``.
                    resource = event.get("object")
                    if isinstance(resource, dict):
                        self._last_seen_resource_version = resource.get(
                            "metadata", dict()
                        ).get("resourceVersion")
                    else:
                        self._last_seen_resource_version = (
                            resource.metadata.resource_version
                        )
                    kwargs["resource_version"] = self._last_seen_resource_version
                    yield event
                # If stream ends after stop just break
                if self._stopped:
                    break
            except urllib3.exceptions.ProtocolError as e:
                wandb.termwarn(f"Broken event stream: {e}")
            except ApiException as e:
                # NOTE: any status other than 410 is retried without logging.
                if e.status == 410:
                    # If resource version is too old we need to start over.
                    # The key may be absent (no event seen yet, or already
                    # removed by a previous 410), so pop() instead of del to
                    # avoid a KeyError escaping the generator.
                    kwargs.pop("resource_version", None)
                    self._last_seen_resource_version = None
            except Exception as E:
                wandb.termerror(f"Unknown exception in event stream: {E}")

    def stop(self) -> None:
        """Stop the wrapped watcher and mark this wrapper as stopped."""
        self._watcher.stop()
        self._stopped = True
|
92
|
+
|
93
|
+
|
94
|
+
def _is_preempted(status: "V1PodStatus") -> bool:
    """Report whether the pod status carries a preemption/eviction condition.

    A pod counts as preempted when any of its conditions has type
    "DisruptionTarget" together with one of the disruption reasons listed
    below. A status without a ``conditions`` attribute, or with it set to
    None, is not preempted.
    """
    pod_conditions = getattr(status, "conditions", None)
    if pod_conditions is None:
        return False
    disruption_reasons = (
        "EvictionByEvictionAPI",
        "PreemptionByScheduler",
        "TerminationByKubelet",
    )
    return any(
        entry.type == "DisruptionTarget" and entry.reason in disruption_reasons
        for entry in pod_conditions
    )
|
105
|
+
|
106
|
+
|
107
|
+
def _is_container_creating(status: "V1PodStatus") -> bool:
    """Report whether any container of the pod is waiting in "ContainerCreating".

    A missing (None) or empty ``container_statuses`` yields False.
    """
    for entry in status.container_statuses or []:
        container_state = entry.state
        if not container_state:
            continue
        waiting = container_state.waiting
        if not waiting:
            continue
        if waiting.reason == "ContainerCreating":
            return True
    return False
|
117
|
+
|
118
|
+
|
119
|
+
def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[str]:
    """Derive a run state from a list of condition dicts.

    Only conditions whose "status" is the string "True" are considered. Their
    "type" is lowercased and translated through CRD_STATE_DICT; types that are
    not in the table are ignored. When several states are detected, the first
    match in the priority order finished, failed, stopping, running, starting
    is returned. Returns None when no state is detected.
    """
    detected = set()
    for condition in conditions:
        if condition.get("status") != "True":
            continue
        mapped = CRD_STATE_DICT.get(condition.get("type", "").lower())
        if mapped is not None:
            detected.add(mapped)
    priority = ("finished", "failed", "stopping", "running", "starting")
    return next((state for state in priority if state in detected), None)
|
131
|
+
|
132
|
+
|
133
|
+
class KubernetesRunMonitor:
    """Track the status of a launched job or custom object from watcher threads."""

    def __init__(
        self,
        job_field_selector: str,
        pod_label_selector: str,
        namespace: str,
        batch_api: "BatchV1Api",
        core_api: "CoreV1Api",
        custom_api: Optional["CustomObjectsApi"] = None,
        group: Optional[str] = None,
        version: Optional[str] = None,
        plural: Optional[str] = None,
    ) -> None:
        """Initialize KubernetesRunMonitor.

        If a custom api is provided, the group, version, and plural arguments must also
        be provided. These are used to query the custom api for a launched custom
        object (CRD). Group, version, and plural in this context refer to the
        Kubernetes API group, version, and plural for the CRD. For more information
        see: https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/

        The run monitor starts two threads to watch for pods and jobs/crds matching the
        provided selectors. The status is set to "starting" when the run monitor is
        initialized. The status is set to "running" when a pod matching the pod selector
        is found with a status of "Running" or has a container with a status of
        "ContainerCreating". The status is set to "finished" when a job matching the job
        selector is found with a status of "Succeeded". The status is set to "failed"
        when a job matching the job selector is found with a status of "Failed". The
        status is set to "preempted" when a pod matching the pod selector is found with
        a condition type of "DisruptionTarget" and a reason of "EvictionByEvictionAPI",
        "PreemptionByScheduler", or "TerminationByKubelet".

        The logic for the CRD is similar to the logic for the job, but we inspect
        both the phase of the CRD and the conditions since some CRDs do not have a
        phase field.

        Arguments:
            job_field_selector: The field selector for the job or crd.
            pod_label_selector: The label selector for the pods.
            namespace: The namespace to monitor.
            batch_api: The batch api client.
            core_api: The core api client.
            custom_api: The custom api client.
            group: The group of the CRD.
            version: The version of the CRD.
            plural: The plural of the CRD.

        Returns:
            None.
        """
        self.pod_label_selector = pod_label_selector
        self.job_field_selector = job_field_selector
        self.namespace = namespace
        self.batch_api = batch_api
        self.core_api = core_api
        self.custom_api = custom_api
        self.group = group
        self.version = version
        self.plural = plural

        # The status is written by the watcher threads and read by callers of
        # get_status(), so every access goes through this lock.
        self._status_lock = Lock()
        self._status = Status("starting")

        # Only one of the job or crd watchers will be used.
        self._watch_job_thread = Thread(target=self._watch_job, daemon=True)
        self._watch_crd_thread = Thread(target=self._watch_crd, daemon=True)

        self._watch_pods_thread = Thread(target=self._watch_pods, daemon=True)

        # The job watcher is shared by the job and crd threads, since only one
        # of them is ever started.
        self._job_watcher = SafeWatch(watch.Watch())
        self._pod_watcher = SafeWatch(watch.Watch())

    def start(self) -> None:
        """Start the run monitor.

        Starts the job watcher thread when no custom api was provided, otherwise
        the crd watcher thread, and in both cases the pod watcher thread.
        """
        if self.custom_api is None:
            self._watch_job_thread.start()
        else:
            self._watch_crd_thread.start()
        self._watch_pods_thread.start()

    def stop(self) -> None:
        """Stop the run monitor by stopping both watchers."""
        self._job_watcher.stop()
        self._pod_watcher.stop()

    def _set_status(self, status: "Status") -> None:
        """Set the run status."""
        with self._status_lock:
            self._status = status

    def _revive_thread(self, thread: Thread, target: Any, warning: str) -> Thread:
        """Return ``thread`` if it is alive, else warn and return a started replacement."""
        if thread.is_alive():
            return thread
        wandb.termwarn(warning)
        replacement = Thread(target=target, daemon=True)
        replacement.start()
        return replacement

    def get_status(self) -> "Status":
        """Get the run status.

        While the status is "running" or "starting", any watcher thread that is
        no longer alive is replaced with a freshly started one before the
        status is returned.
        """
        with self._status_lock:
            # Each time this is called we verify that our watchers are active.
            if self._status.state in ["running", "starting"]:
                if self.custom_api is None:
                    self._watch_job_thread = self._revive_thread(
                        self._watch_job_thread,
                        self._watch_job,
                        f"Job watcher thread is dead for {self.job_field_selector}",
                    )
                else:
                    self._watch_crd_thread = self._revive_thread(
                        self._watch_crd_thread,
                        self._watch_crd,
                        f"CRD watcher thread is dead for {self.job_field_selector}",
                    )
                self._watch_pods_thread = self._revive_thread(
                    self._watch_pods_thread,
                    self._watch_pods,
                    f"Pod watcher thread is dead for {self.pod_label_selector}",
                )
            return self._status

    def _watch_pods(self) -> None:
        """Watch for pods created matching the jobname."""
        # Follow pod status updates until the monitor is stopped or the pod is
        # preempted; the SafeWatch wrapper reopens the stream when it ends.
        for event in self._pod_watcher.stream(
            self.core_api.list_namespaced_pod,
            namespace=self.namespace,
            label_selector=self.pod_label_selector,
        ):
            pod = event.get("object")
            # Sometimes ADDED events will be missing field. Skip events without
            # a usable status instead of crashing the watcher thread.
            pod_status = getattr(pod, "status", None)
            if pod_status is None:
                continue
            if pod_status.phase == "Running":
                self._set_status(Status("running"))
            if _is_preempted(pod_status):
                self._set_status(Status("preempted"))
                self.stop()
                break
            if _is_container_creating(pod_status):
                self._set_status(Status("running"))

    def _watch_job(self) -> None:
        """Watch for job matching the jobname."""
        for event in self._job_watcher.stream(
            self.batch_api.list_namespaced_job,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
        ):
            job = event.get("object")
            if job.status.succeeded == 1:
                self._set_status(Status("finished"))
                self.stop()
                break
            elif job.status.failed is not None and job.status.failed >= 1:
                self._set_status(Status("failed"))
                self.stop()
                break

    def _watch_crd(self) -> None:
        """Watch for CRD matching the jobname."""
        for event in self._job_watcher.stream(
            self.custom_api.list_namespaced_custom_object,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
            group=self.group,
            version=self.version,
            plural=self.plural,
        ):
            crd = event.get("object")
            crd_status = crd.get("status")
            if crd_status is None:
                continue
            state = crd_status.get("state")
            if isinstance(state, dict):
                raw_phase = state.get("phase", "")
                # CRD_STATE_DICT keys are lowercase, so normalise the phase the
                # same way _state_from_conditions normalises condition types.
                state = (
                    CRD_STATE_DICT.get(raw_phase.lower())
                    if isinstance(raw_phase, str)
                    else None
                )
            else:
                conditions = crd_status.get("conditions")
                if isinstance(conditions, list):
                    state = _state_from_conditions(conditions)
                else:
                    # This should never happen.
                    _logger.warning(
                        f"Unexpected conditions type {type(conditions)} "
                        f"for CRD {self.job_field_selector}: {conditions}"
                    )
                    # Do not let an unmapped raw "state" value leak into Status.
                    state = None
            if state is None:
                continue
            status = Status(state)
            self._set_status(status)
            if status.state in ["finished", "failed", "preempted"]:
                self.stop()
                break
|