wandb 0.15.10__py3-none-any.whl → 0.15.11__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -1
- wandb/apis/public.py +51 -9
- wandb/apis/reports/blocks.py +1 -0
- wandb/cli/cli.py +14 -9
- wandb/env.py +11 -1
- wandb/integration/xgboost/xgboost.py +3 -3
- wandb/proto/v3/wandb_internal_pb2.py +300 -267
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
- wandb/proto/v4/wandb_internal_pb2.py +260 -252
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
- wandb/sdk/artifacts/artifact.py +9 -6
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
- wandb/sdk/data_types/image.py +1 -1
- wandb/sdk/internal/file_stream.py +2 -1
- wandb/sdk/internal/handler.py +24 -20
- wandb/sdk/internal/internal_api.py +9 -1
- wandb/sdk/internal/sender.py +4 -1
- wandb/sdk/internal/system/system_info.py +2 -2
- wandb/sdk/launch/__init__.py +5 -0
- wandb/sdk/launch/{launch.py → _launch.py} +53 -54
- wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
- wandb/sdk/launch/agent/agent.py +36 -18
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
- wandb/sdk/launch/runner/abstract.py +0 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +44 -301
- wandb/sdk/launch/runner/local_container.py +5 -2
- wandb/sdk/launch/sweeps/scheduler.py +14 -10
- wandb/sdk/launch/sweeps/utils.py +5 -3
- wandb/sdk/launch/utils.py +3 -1
- wandb/sdk/lib/_settings_toposort_generated.py +5 -0
- wandb/sdk/lib/gql_request.py +3 -0
- wandb/sdk/lib/ipython.py +4 -0
- wandb/sdk/service/service.py +19 -6
- wandb/sdk/wandb_init.py +7 -2
- wandb/sdk/wandb_run.py +2 -5
- wandb/sdk/wandb_settings.py +48 -2
- wandb/util.py +1 -1
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/METADATA +4 -1
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/RECORD +46 -45
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/WHEEL +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import pprint
|
|
2
2
|
from typing import Any, Dict, List, Optional
|
3
3
|
|
4
4
|
import wandb
|
5
|
-
|
5
|
+
from wandb.apis import public
|
6
6
|
from wandb.apis.internal import Api
|
7
7
|
from wandb.sdk.launch._project_spec import create_project_from_spec
|
8
8
|
from wandb.sdk.launch.builder.build import build_image_from_project
|
@@ -49,39 +49,42 @@ def launch_add(
|
|
49
49
|
"""Enqueue a W&B launch experiment. With either a source uri, job or docker_image.
|
50
50
|
|
51
51
|
Arguments:
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
52
|
+
uri: URI of experiment to run. A wandb run uri or a Git repository URI.
|
53
|
+
job: string reference to a wandb.Job eg: wandb/test/my-job:latest
|
54
|
+
config: A dictionary containing the configuration for the run. May also contain
|
55
|
+
resource specific arguments under the key "resource_args"
|
56
|
+
project: Target project to send launched run to
|
57
|
+
entity: Target entity to send launched run to
|
58
|
+
queue: the name of the queue to enqueue the run to
|
59
|
+
resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
|
60
|
+
entry_point: Entry point to run within the project. Defaults to using the entry point used
|
61
|
+
in the original run for wandb URIs, or main.py for git repository URIs.
|
62
|
+
name: Name under which to launch the run.
|
63
|
+
version: For Git-based projects, either a commit hash or a branch name.
|
64
|
+
docker_image: The name of the docker image to use for the run.
|
65
|
+
resource_args: Resource related arguments for launching runs onto a remote backend.
|
66
|
+
Will be stored on the constructed launch config under ``resource_args``.
|
67
|
+
run_id: optional string indicating the id of the launched run
|
68
|
+
build: optional flag defaulting to false, requires queue to be set
|
69
|
+
if build, an image is created, creates a job artifact, pushes a reference
|
70
|
+
to that job artifact to queue
|
71
|
+
repository: optional string to control the name of the remote repository, used when
|
72
|
+
pushing images to a registry
|
73
|
+
project_queue: optional string to control the name of the project for the queue. Primarily used
|
74
|
+
for back compatibility with project scoped queues
|
75
75
|
|
76
76
|
|
77
77
|
Example:
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
78
|
+
```python
|
79
|
+
from wandb.sdk.launch import launch_add
|
80
|
+
|
81
|
+
project_uri = "https://github.com/wandb/examples"
|
82
|
+
params = {"alpha": 0.5, "l1_ratio": 0.01}
|
83
|
+
# Run W&B project and create a reproducible docker environment
|
84
|
+
# on a local host
|
85
|
+
api = wandb.apis.internal.Api()
|
86
|
+
launch_add(uri=project_uri, parameters=params)
|
87
|
+
```
|
85
88
|
|
86
89
|
|
87
90
|
Returns:
|
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
11
11
|
import wandb
|
12
12
|
from wandb.apis.internal import Api
|
13
13
|
from wandb.errors import CommError
|
14
|
-
from wandb.sdk.launch.
|
14
|
+
from wandb.sdk.launch._launch_add import launch_add
|
15
15
|
from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
|
16
16
|
from wandb.sdk.launch.runner.local_process import LocalProcessRunner
|
17
17
|
from wandb.sdk.launch.sweeps.scheduler import Scheduler
|
@@ -36,6 +36,8 @@ HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
|
|
36
36
|
|
37
37
|
MAX_RESUME_COUNT = 5
|
38
38
|
|
39
|
+
RUN_INFO_GRACE_PERIOD = 60
|
40
|
+
|
39
41
|
_env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
|
40
42
|
if _env_timeout:
|
41
43
|
try:
|
@@ -301,27 +303,43 @@ class LaunchAgent:
|
|
301
303
|
job_and_run_status.err_stage,
|
302
304
|
fnames,
|
303
305
|
)
|
304
|
-
elif job_and_run_status.completed_status not in ["stopped", "failed"]:
|
305
|
-
_logger.info(
|
306
|
-
"Skipping check for completed run status because run was successful"
|
307
|
-
)
|
308
306
|
elif job_and_run_status.run is not None:
|
309
307
|
run_info = None
|
310
|
-
#
|
311
|
-
#
|
312
|
-
#
|
313
|
-
#
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
308
|
+
# We do some weird stuff here getting run info to check for a
|
309
|
+
# created run in W&B.
|
310
|
+
#
|
311
|
+
# We retry for 60 seconds with an exponential backoff in case
|
312
|
+
# upsert run is taking a while.
|
313
|
+
#
|
314
|
+
# Sweep runs exist but have no info before they are started
|
315
|
+
# so run_info returned will be None, while normal runs just throw a
|
316
|
+
# comm error.
|
317
|
+
start_time = time.time()
|
318
|
+
interval = 1
|
319
|
+
while True:
|
320
|
+
try:
|
321
|
+
run_info = self._api.get_run_info(
|
322
|
+
self._entity,
|
323
|
+
job_and_run_status.project,
|
324
|
+
job_and_run_status.run_id,
|
325
|
+
)
|
326
|
+
except CommError:
|
327
|
+
pass
|
328
|
+
if (
|
329
|
+
run_info is not None
|
330
|
+
or time.time() - start_time > RUN_INFO_GRACE_PERIOD
|
331
|
+
):
|
332
|
+
break
|
333
|
+
if run_info is None:
|
334
|
+
time.sleep(interval)
|
335
|
+
interval *= 2
|
318
336
|
|
319
|
-
except CommError:
|
320
|
-
pass
|
321
337
|
if run_info is None:
|
322
|
-
_msg = "The submitted run was not successfully started"
|
323
338
|
fnames = None
|
324
|
-
|
339
|
+
if job_and_run_status.completed_status == "finished":
|
340
|
+
_msg = "The submitted job exited successfully but failed to call wandb.init"
|
341
|
+
else:
|
342
|
+
_msg = "The submitted run was not successfully started"
|
325
343
|
logs = job_and_run_status.run.get_logs()
|
326
344
|
if logs:
|
327
345
|
fnames = job_and_run_status.saver.save_contents(
|
@@ -331,7 +349,7 @@ class LaunchAgent:
|
|
331
349
|
job_and_run_status.run_queue_item_id, _msg, "run", fnames
|
332
350
|
)
|
333
351
|
else:
|
334
|
-
_logger.info("Finish thread id had no exception
|
352
|
+
_logger.info(f"Finish thread id {thread_id} had no exception and no run")
|
335
353
|
wandb._sentry.exception(
|
336
354
|
"launch agent called finish thread id on thread without run or exception"
|
337
355
|
)
|
@@ -5,8 +5,6 @@ import sys
|
|
5
5
|
from typing import List, Optional, Union
|
6
6
|
|
7
7
|
import wandb
|
8
|
-
from wandb.sdk.lib import RunDisabled
|
9
|
-
from wandb.sdk.wandb_run import Run
|
10
8
|
|
11
9
|
if sys.version_info >= (3, 8):
|
12
10
|
from typing import Literal
|
@@ -18,7 +16,11 @@ FileSubtypes = Literal["warning", "error"]
|
|
18
16
|
|
19
17
|
class RunQueueItemFileSaver:
|
20
18
|
def __init__(
|
21
|
-
self,
|
19
|
+
self,
|
20
|
+
agent_run: Optional[
|
21
|
+
Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
|
22
|
+
],
|
23
|
+
run_queue_item_id: str,
|
22
24
|
):
|
23
25
|
self.run_queue_item_id = run_queue_item_id
|
24
26
|
self.run = agent_run
|
@@ -26,7 +28,7 @@ class RunQueueItemFileSaver:
|
|
26
28
|
def save_contents(
|
27
29
|
self, contents: str, fname: str, file_sub_type: FileSubtypes
|
28
30
|
) -> Optional[List[str]]:
|
29
|
-
if not isinstance(self.run, Run):
|
31
|
+
if not isinstance(self.run, wandb.sdk.wandb_run.Run):
|
30
32
|
wandb.termwarn("Not saving file contents because agent has no run")
|
31
33
|
return None
|
32
34
|
root_dir = self.run._settings.files_dir
|
@@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Union
|
|
13
13
|
from dockerpycreds.utils import find_executable # type: ignore
|
14
14
|
|
15
15
|
import wandb
|
16
|
-
from wandb import Settings
|
17
16
|
from wandb.apis.internal import Api
|
18
17
|
from wandb.sdk.lib import runid
|
19
18
|
|
@@ -136,7 +135,6 @@ class AbstractRunner(ABC):
|
|
136
135
|
api: Api,
|
137
136
|
backend_config: Dict[str, Any],
|
138
137
|
) -> None:
|
139
|
-
self._settings = Settings()
|
140
138
|
self._api = api
|
141
139
|
self.backend_config = backend_config
|
142
140
|
self._cwd = os.getcwd()
|
@@ -0,0 +1,329 @@
|
|
1
|
+
import logging
|
2
|
+
from threading import Lock, Thread
|
3
|
+
from typing import Any, Dict, List, Optional
|
4
|
+
|
5
|
+
import urllib3
|
6
|
+
from kubernetes import watch # type: ignore # noqa: F401
|
7
|
+
from kubernetes.client import ( # type: ignore # noqa: F401
|
8
|
+
ApiException,
|
9
|
+
BatchV1Api,
|
10
|
+
CoreV1Api,
|
11
|
+
CustomObjectsApi,
|
12
|
+
V1PodStatus,
|
13
|
+
)
|
14
|
+
|
15
|
+
import wandb
|
16
|
+
|
17
|
+
from .abstract import State, Status
|
18
|
+
|
19
|
+
# Dict for mapping possible states of custom objects to the states we want to report
|
20
|
+
# to the agent.
|
21
|
+
CRD_STATE_DICT: Dict[str, State] = {
    # Keys are the lowercase states/condition types a custom object may
    # report; values are the states handed back to the agent.
    #
    # Reported as "starting".
    "created": "starting",
    "pending": "starting",
    # Reported as "running".
    "running": "running",
    "completing": "running",
    # Reported as "finished".
    "succeeded": "finished",
    "completed": "finished",
    # Reported as "failed".
    "failed": "failed",
    "aborted": "failed",
    "timeout": "failed",
    "terminated": "failed",
    # Reported as "stopping".
    "terminating": "stopping",
}


# Logger named after this module.
_logger = logging.getLogger(__name__)
|
42
|
+
|
43
|
+
|
44
|
+
class SafeWatch:
    """Wrapper for the kubernetes watch class that can recover in more situations.

    The wrapper remembers the resource version of the last event it yielded
    and passes it back to the wrapped watcher whenever the stream has to be
    re-opened, so the watch resumes where it left off.
    """

    def __init__(self, watcher: "watch.Watch") -> None:
        """Initialize the SafeWatch.

        Arguments:
            watcher: The watcher object whose ``stream`` and ``stop`` methods
                are wrapped.
        """
        self._watcher = watcher
        self._last_seen_resource_version: Optional[str] = None
        self._stopped = False

    def stream(self, func: Any, *args: Any, **kwargs: Any) -> Any:
        """Stream events from the wrapped watcher until ``stop`` is called.

        The wrapped stream is opened with ``timeout_seconds=15`` and re-opened
        every time it ends or raises, for as long as the watcher has not been
        stopped.

        Arguments:
            func: The list function handed to the wrapped watcher.
            *args: Positional arguments forwarded to the wrapped watcher.
            **kwargs: Keyword arguments forwarded to the wrapped watcher. The
                ``resource_version`` key is managed by this method.

        Yields:
            The events produced by the wrapped watcher.
        """
        # Re-check the flag before every (re)connect so that a stopped watcher
        # never re-opens the stream, even when the last attempt ended in an
        # exception.
        while not self._stopped:
            try:
                for event in self._watcher.stream(
                    func, *args, **kwargs, timeout_seconds=15
                ):
                    if self._stopped:
                        break
                    # Save the resource version so that we can resume the
                    # stream if it breaks. Custom objects arrive as plain
                    # dicts, other resources as model objects.
                    resource = event.get("object")
                    if isinstance(resource, dict):
                        self._last_seen_resource_version = resource.get(
                            "metadata", {}
                        ).get("resourceVersion")
                    else:
                        self._last_seen_resource_version = (
                            resource.metadata.resource_version
                        )
                    kwargs["resource_version"] = self._last_seen_resource_version
                    yield event
            except urllib3.exceptions.ProtocolError as e:
                wandb.termwarn(f"Broken event stream: {e}")
            except ApiException as e:
                if e.status == 410:
                    # If resource version is too old we need to start over.
                    # The key may be absent when no event has been seen yet,
                    # so remove it without raising.
                    kwargs.pop("resource_version", None)
                    self._last_seen_resource_version = None
                # Any other API error falls through and the stream is retried.
            except Exception as E:
                wandb.termerror(f"Unknown exception in event stream: {E}")

    def stop(self) -> None:
        """Stop the wrapped watcher and end any running ``stream`` loop."""
        self._watcher.stop()
        self._stopped = True
|
92
|
+
|
93
|
+
|
94
|
+
def _is_preempted(status: "V1PodStatus") -> bool:
|
95
|
+
"""Check if this pod has been preempted."""
|
96
|
+
if hasattr(status, "conditions") and status.conditions is not None:
|
97
|
+
for condition in status.conditions:
|
98
|
+
if condition.type == "DisruptionTarget" and condition.reason in [
|
99
|
+
"EvictionByEvictionAPI",
|
100
|
+
"PreemptionByScheduler",
|
101
|
+
"TerminationByKubelet",
|
102
|
+
]:
|
103
|
+
return True
|
104
|
+
return False
|
105
|
+
|
106
|
+
|
107
|
+
def _is_container_creating(status: "V1PodStatus") -> bool:
|
108
|
+
"""Check if this pod has started creating containers."""
|
109
|
+
for container_status in status.container_statuses or []:
|
110
|
+
if (
|
111
|
+
container_status.state
|
112
|
+
and container_status.state.waiting
|
113
|
+
and container_status.state.waiting.reason == "ContainerCreating"
|
114
|
+
):
|
115
|
+
return True
|
116
|
+
return False
|
117
|
+
|
118
|
+
|
119
|
+
def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[str]:
    """Derive a single reported state from a list of condition dicts.

    Only conditions whose "status" is the string "True" are considered. Their
    lowercased "type" is translated through CRD_STATE_DICT, and the first
    match in the priority order below wins.

    Returns:
        The selected state, or None when no condition maps to a known state.
    """
    detected = set()
    for cond in conditions:
        if cond.get("status") != "True":
            continue
        mapped = CRD_STATE_DICT.get(cond.get("type", "").lower())
        if mapped is not None:
            detected.add(mapped)
    priority = ("finished", "failed", "stopping", "running", "starting")
    return next((state for state in priority if state in detected), None)
|
131
|
+
|
132
|
+
|
133
|
+
class KubernetesRunMonitor:
    """Track the status of a launched Kubernetes job or custom object (CRD)."""

    def __init__(
        self,
        job_field_selector: str,
        pod_label_selector: str,
        namespace: str,
        batch_api: "BatchV1Api",
        core_api: "CoreV1Api",
        custom_api: Optional["CustomObjectsApi"] = None,
        group: Optional[str] = None,
        version: Optional[str] = None,
        plural: Optional[str] = None,
    ) -> None:
        """Initialize KubernetesRunMonitor.

        If a custom api is provided, the group, version, and plural arguments must also
        be provided. These are used to query the custom api for a launched custom
        object (CRD). Group, version, and plural in this context refer to the
        Kubernetes API group, version, and plural for the CRD. For more information
        see: https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/

        The run monitor starts two threads to watch for pods and jobs/crds matching the
        provided selectors. The status is set to "starting" when the run monitor is
        initialized. The status is set to "running" when a pod matching the pod selector
        is found with a status of "Running" or has a container with a status of
        "ContainerCreating". The status is set to "finished" when a job matching the job
        selector is found with a status of "Succeeded". The status is set to "failed"
        when a job matching the job selector is found with a status of "Failed". The
        status is set to "preempted" when a pod matching the pod selector is found with
        a condition type of "DisruptionTarget" and a reason of "EvictionByEvictionAPI",
        "PreemptionByScheduler", or "TerminationByKubelet".

        The logic for the CRD is similar to the logic for the job, but we inspect
        both the phase of the CRD and the conditions since some CRDs do not have a
        phase field.

        Arguments:
            job_field_selector: The field selector for the job or crd.
            pod_label_selector: The label selector for the pods.
            namespace: The namespace to monitor.
            batch_api: The batch api client.
            core_api: The core api client.
            custom_api: The custom api client.
            group: The group of the CRD.
            version: The version of the CRD.
            plural: The plural of the CRD.

        Returns:
            None.
        """
        self.pod_label_selector = pod_label_selector
        self.job_field_selector = job_field_selector
        self.namespace = namespace
        self.batch_api = batch_api
        self.core_api = core_api
        self.custom_api = custom_api
        self.group = group
        self.version = version
        self.plural = plural

        self._status_lock = Lock()
        self._status = Status("starting")

        # Only one of the job or crd watchers will be used.
        self._watch_job_thread = Thread(target=self._watch_job, daemon=True)
        self._watch_crd_thread = Thread(target=self._watch_crd, daemon=True)

        self._watch_pods_thread = Thread(target=self._watch_pods, daemon=True)

        self._job_watcher = SafeWatch(watch.Watch())
        self._pod_watcher = SafeWatch(watch.Watch())

    def start(self) -> None:
        """Start the run monitor.

        Starts the job watcher thread when no custom api was given, otherwise
        the CRD watcher thread, and always the pod watcher thread.
        """
        if self.custom_api is None:
            self._watch_job_thread.start()
        else:
            self._watch_crd_thread.start()
        self._watch_pods_thread.start()

    def stop(self) -> None:
        """Stop the run monitor by stopping both watchers."""
        self._job_watcher.stop()
        self._pod_watcher.stop()

    def _set_status(self, status: "Status") -> None:
        """Set the run status while holding the status lock."""
        with self._status_lock:
            self._status = status

    def get_status(self) -> "Status":
        """Get the run status.

        While the run is "running" or "starting", any watcher thread that is
        no longer alive is replaced by a fresh one before the status is
        returned.
        """
        with self._status_lock:
            # Each time this is called we verify that our watchers are active.
            if self._status.state in ["running", "starting"]:
                if self.custom_api is None:
                    if not self._watch_job_thread.is_alive():
                        wandb.termwarn(
                            f"Job watcher thread is dead for {self.job_field_selector}"
                        )
                        self._watch_job_thread = Thread(
                            target=self._watch_job, daemon=True
                        )
                        self._watch_job_thread.start()
                else:
                    if not self._watch_crd_thread.is_alive():
                        wandb.termwarn(
                            f"CRD watcher thread is dead for {self.job_field_selector}"
                        )
                        self._watch_crd_thread = Thread(
                            target=self._watch_crd, daemon=True
                        )
                        self._watch_crd_thread.start()
                if not self._watch_pods_thread.is_alive():
                    wandb.termwarn(
                        f"Pod watcher thread is dead for {self.pod_label_selector}"
                    )
                    self._watch_pods_thread = Thread(
                        target=self._watch_pods, daemon=True
                    )
                    self._watch_pods_thread.start()
            return self._status

    def _watch_pods(self) -> None:
        """Watch for pods created matching the jobname."""
        for event in self._pod_watcher.stream(
            self.core_api.list_namespaced_pod,
            namespace=self.namespace,
            label_selector=self.pod_label_selector,
        ):
            pod = event.get("object")
            # Sometimes ADDED events will be missing the status field; also
            # skip pods whose status is present but empty.
            pod_status = getattr(pod, "status", None)
            if pod_status is None:
                continue
            if pod_status.phase == "Running":
                self._set_status(Status("running"))
            if _is_preempted(pod_status):
                # Preemption ends the run, so both watchers are shut down.
                self._set_status(Status("preempted"))
                self.stop()
                break
            if _is_container_creating(pod_status):
                self._set_status(Status("running"))

    def _watch_job(self) -> None:
        """Watch for job matching the jobname."""
        for event in self._job_watcher.stream(
            self.batch_api.list_namespaced_job,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
        ):
            job = event.get("object")
            if job.status.succeeded == 1:
                self._set_status(Status("finished"))
                self.stop()
                break
            elif job.status.failed is not None and job.status.failed >= 1:
                self._set_status(Status("failed"))
                self.stop()
                break

    def _watch_crd(self) -> None:
        """Watch for CRD matching the jobname.

        The state is taken from ``status.state.phase`` when ``status.state``
        is a dict, otherwise from the ``status.conditions`` list. Events that
        do not resolve to a known state are ignored.
        """
        for event in self._job_watcher.stream(
            self.custom_api.list_namespaced_custom_object,
            namespace=self.namespace,
            field_selector=self.job_field_selector,
            group=self.group,
            version=self.version,
            plural=self.plural,
        ):
            crd = event.get("object")
            status = crd.get("status")
            if status is None:
                continue
            state = status.get("state")
            if isinstance(state, dict):
                raw_state = state.get("phase", "")
                # CRD_STATE_DICT keys are lowercase, so normalise the phase the
                # same way condition types are normalised.
                state = (
                    CRD_STATE_DICT.get(raw_state.lower())
                    if isinstance(raw_state, str)
                    else None
                )
            else:
                conditions = status.get("conditions")
                if isinstance(conditions, list):
                    state = _state_from_conditions(conditions)
                else:
                    # This should never happen.
                    _logger.warning(
                        f"Unexpected conditions type {type(conditions)} "
                        f"for CRD {self.job_field_selector}: {conditions}"
                    )
                    # Do not let the raw, unmapped value of status["state"]
                    # be reported as the run state.
                    state = None
            if state is None:
                continue
            run_status = Status(state)
            self._set_status(run_status)
            if run_status.state in ["finished", "failed", "preempted"]:
                self.stop()
                break
|