wandb 0.15.10-py3-none-any.whl → 0.15.11-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- wandb/__init__.py +2 -1
- wandb/apis/public.py +51 -9
- wandb/apis/reports/blocks.py +1 -0
- wandb/cli/cli.py +14 -9
- wandb/env.py +11 -1
- wandb/integration/xgboost/xgboost.py +3 -3
- wandb/proto/v3/wandb_internal_pb2.py +300 -267
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
- wandb/proto/v4/wandb_internal_pb2.py +260 -252
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
- wandb/sdk/artifacts/artifact.py +9 -6
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
- wandb/sdk/data_types/image.py +1 -1
- wandb/sdk/internal/file_stream.py +2 -1
- wandb/sdk/internal/handler.py +24 -20
- wandb/sdk/internal/internal_api.py +9 -1
- wandb/sdk/internal/sender.py +4 -1
- wandb/sdk/internal/system/system_info.py +2 -2
- wandb/sdk/launch/__init__.py +5 -0
- wandb/sdk/launch/{launch.py → _launch.py} +53 -54
- wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
- wandb/sdk/launch/agent/agent.py +36 -18
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
- wandb/sdk/launch/runner/abstract.py +0 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +44 -301
- wandb/sdk/launch/runner/local_container.py +5 -2
- wandb/sdk/launch/sweeps/scheduler.py +14 -10
- wandb/sdk/launch/sweeps/utils.py +5 -3
- wandb/sdk/launch/utils.py +3 -1
- wandb/sdk/lib/_settings_toposort_generated.py +5 -0
- wandb/sdk/lib/gql_request.py +3 -0
- wandb/sdk/lib/ipython.py +4 -0
- wandb/sdk/service/service.py +19 -6
- wandb/sdk/wandb_init.py +7 -2
- wandb/sdk/wandb_run.py +2 -5
- wandb/sdk/wandb_settings.py +48 -2
- wandb/util.py +1 -1
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/METADATA +4 -1
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/RECORD +46 -45
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/WHEEL +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
- {wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
wandb/sdk/launch/{launch_add.py → _launch_add.py}
RENAMED
@@ -2,7 +2,7 @@ import pprint
 from typing import Any, Dict, List, Optional
 
 import wandb
-(removed import; original text not captured in this diff view)
+from wandb.apis import public
 from wandb.apis.internal import Api
 from wandb.sdk.launch._project_spec import create_project_from_spec
 from wandb.sdk.launch.builder.build import build_image_from_project
@@ -49,39 +49,42 @@ def launch_add(
     """Enqueue a W&B launch experiment. With either a source uri, job or docker_image.
 
     Arguments:
-(old argument descriptions, 23 lines; original text not captured in this diff view)
+        uri: URI of experiment to run. A wandb run uri or a Git repository URI.
+        job: string reference to a wandb.Job eg: wandb/test/my-job:latest
+        config: A dictionary containing the configuration for the run. May also contain
+            resource specific arguments under the key "resource_args"
+        project: Target project to send launched run to
+        entity: Target entity to send launched run to
+        queue: the name of the queue to enqueue the run to
+        resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
+        entry_point: Entry point to run within the project. Defaults to using the entry point used
+            in the original run for wandb URIs, or main.py for git repository URIs.
+        name: Name run under which to launch the run.
+        version: For Git-based projects, either a commit hash or a branch name.
+        docker_image: The name of the docker image to use for the run.
+        resource_args: Resource related arguments for launching runs onto a remote backend.
+            Will be stored on the constructed launch config under ``resource_args``.
+        run_id: optional string indicating the id of the launched run
+        build: optional flag defaulting to false, requires queue to be set
+            if build, an image is created, creates a job artifact, pushes a reference
+            to that job artifact to queue
+        repository: optional string to control the name of the remote repository, used when
+            pushing images to a registry
+        project_queue: optional string to control the name of the project for the queue. Primarily used
+            for back compatibility with project scoped queues
 
 
     Example:
-(old example, 7 lines; original text not captured in this diff view)
+        ```python
+        from wandb.sdk.launch import launch_add
+
+        project_uri = "https://github.com/wandb/examples"
+        params = {"alpha": 0.5, "l1_ratio": 0.01}
+        # Run W&B project and create a reproducible docker environment
+        # on a local host
+        api = wandb.apis.internal.Api()
+        launch_add(uri=project_uri, parameters=params)
+        ```
 
 
     Returns:
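For context, a minimal sketch of enqueueing against a named queue with the renamed module, based only on the arguments documented in the hunk above. The entity, project, queue, and job names are hypothetical placeholders, not values from this diff.

```python
from wandb.sdk.launch import launch_add

# Hypothetical identifiers; substitute your own entity/project/queue.
launch_add(
    job="my-entity/my-project/my-job:latest",  # a wandb.Job reference
    entity="my-entity",
    project="my-project",
    queue="default",             # name of the queue to enqueue the run to
    project_queue="my-project",  # back-compat scoping for project-scoped queues
    resource="local-container",  # built-in execution backend
)
```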
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Union
 import wandb
 from wandb.apis.internal import Api
 from wandb.errors import CommError
-from wandb.sdk.launch.launch_add import launch_add
+from wandb.sdk.launch._launch_add import launch_add
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
 from wandb.sdk.launch.runner.local_process import LocalProcessRunner
 from wandb.sdk.launch.sweeps.scheduler import Scheduler
@@ -36,6 +36,8 @@ HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
 
 MAX_RESUME_COUNT = 5
 
+RUN_INFO_GRACE_PERIOD = 60
+
 _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
 if _env_timeout:
     try:
@@ -301,27 +303,43 @@ class LaunchAgent:
                 job_and_run_status.err_stage,
                 fnames,
             )
-        elif job_and_run_status.completed_status not in ["stopped", "failed"]:
-            _logger.info(
-                "Skipping check for completed run status because run was successful"
-            )
         elif job_and_run_status.run is not None:
             run_info = None
-            # (old explanatory comment, 4 lines; original text not captured in this diff view)
-            (old run-info lookup, 4 lines; original text not captured in this diff view)
+            # We do some weird stuff here getting run info to check for a
+            # created in run in W&B.
+            #
+            # We retry for 60 seconds with an exponential backoff in case
+            # upsert run is taking a while.
+            #
+            # Sweep runs exist but have no info before they are started
+            # so run_info returned will be None, while normal runs just throw a
+            # comm error.
+            start_time = time.time()
+            interval = 1
+            while True:
+                try:
+                    run_info = self._api.get_run_info(
+                        self._entity,
+                        job_and_run_status.project,
+                        job_and_run_status.run_id,
+                    )
+                except CommError:
+                    pass
+                if (
+                    run_info is not None
+                    or time.time() - start_time > RUN_INFO_GRACE_PERIOD
+                ):
+                    break
+                if run_info is None:
+                    time.sleep(interval)
+                    interval *= 2
 
-            except CommError:
-                pass
             if run_info is None:
-                _msg = "The submitted run was not successfully started"
                 fnames = None
-
+                if job_and_run_status.completed_status == "finished":
+                    _msg = "The submitted job exited successfully but failed to call wandb.init"
+                else:
+                    _msg = "The submitted run was not successfully started"
                 logs = job_and_run_status.run.get_logs()
                 if logs:
                     fnames = job_and_run_status.saver.save_contents(
@@ -331,7 +349,7 @@ class LaunchAgent:
                 job_and_run_status.run_queue_item_id, _msg, "run", fnames
             )
         else:
-            _logger.info("Finish thread id had no exception and no run")
+            _logger.info(f"Finish thread id {thread_id} had no exception and no run")
             wandb._sentry.exception(
                 "launch agent called finish thread id on thread without run or exception"
             )
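The block added above retries `get_run_info` for up to RUN_INFO_GRACE_PERIOD seconds, doubling the sleep between attempts. A minimal standalone sketch of the same backoff pattern, where `fetch` is a hypothetical stand-in for `api.get_run_info(...)`:

```python
import time

GRACE_PERIOD = 60  # seconds, mirroring RUN_INFO_GRACE_PERIOD above


def poll_with_backoff(fetch, grace_period=GRACE_PERIOD):
    """Call fetch() until it returns a value or the grace period expires.

    Sleeps 1s, 2s, 4s, ... between attempts and returns None on timeout.
    """
    start, interval = time.time(), 1
    while True:
        result = None
        try:
            result = fetch()  # stand-in for api.get_run_info(...)
        except Exception:  # the agent narrows this to CommError
            pass
        if result is not None or time.time() - start > grace_period:
            return result
        time.sleep(interval)
        interval *= 2
```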
wandb/sdk/launch/agent/run_queue_item_file_saver.py
CHANGED
@@ -5,8 +5,6 @@ import sys
 from typing import List, Optional, Union
 
 import wandb
-from wandb.sdk.lib import RunDisabled
-from wandb.sdk.wandb_run import Run
 
 if sys.version_info >= (3, 8):
     from typing import Literal
@@ -18,7 +16,11 @@ FileSubtypes = Literal["warning", "error"]
 
 class RunQueueItemFileSaver:
     def __init__(
-        self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
+        self,
+        agent_run: Optional[
+            Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
+        ],
+        run_queue_item_id: str,
     ):
         self.run_queue_item_id = run_queue_item_id
         self.run = agent_run
@@ -26,7 +28,7 @@ class RunQueueItemFileSaver:
     def save_contents(
         self, contents: str, fname: str, file_sub_type: FileSubtypes
     ) -> Optional[List[str]]:
-        if not isinstance(self.run, Run):
+        if not isinstance(self.run, wandb.sdk.wandb_run.Run):
             wandb.termwarn("Not saving file contents because agent has no run")
             return None
         root_dir = self.run._settings.files_dir
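The signature change above replaces the directly imported `Run` and `RunDisabled` types with string annotations resolved through the top-level `wandb` package at call time, a common way to break import cycles. A hedged illustration of the same pattern in a hypothetical helper:

```python
from typing import Optional

import wandb

# The annotation is a string, so wandb.sdk.wandb_run does not need to be
# imported when this module loads; isinstance resolves the class lazily.
def run_label(run: Optional["wandb.sdk.wandb_run.Run"]) -> str:
    if isinstance(run, wandb.sdk.wandb_run.Run):
        return f"run {run.id}"
    return "no run"
```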
wandb/sdk/launch/runner/abstract.py
CHANGED
@@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Union
 from dockerpycreds.utils import find_executable  # type: ignore
 
 import wandb
-from wandb import Settings
 from wandb.apis.internal import Api
 from wandb.sdk.lib import runid
 
@@ -136,7 +135,6 @@ class AbstractRunner(ABC):
         api: Api,
         backend_config: Dict[str, Any],
     ) -> None:
-        self._settings = Settings()
         self._api = api
         self.backend_config = backend_config
         self._cwd = os.getcwd()
wandb/sdk/launch/runner/kubernetes_monitor.py
ADDED
@@ -0,0 +1,329 @@
+import logging
+from threading import Lock, Thread
+from typing import Any, Dict, List, Optional
+
+import urllib3
+from kubernetes import watch  # type: ignore # noqa: F401
+from kubernetes.client import (  # type: ignore # noqa: F401
+    ApiException,
+    BatchV1Api,
+    CoreV1Api,
+    CustomObjectsApi,
+    V1PodStatus,
+)
+
+import wandb
+
+from .abstract import State, Status
+
+# Dict for mapping possible states of custom objects to the states we want to report
+# to the agent.
+CRD_STATE_DICT: Dict[str, State] = {
+    # Starting states.
+    "created": "starting",
+    "pending": "starting",
+    # Running states.
+    "running": "running",
+    "completing": "running",
+    # Finished states.
+    "succeeded": "finished",
+    "completed": "finished",
+    # Failed states.
+    "failed": "failed",
+    "aborted": "failed",
+    "timeout": "failed",
+    "terminated": "failed",
+    # Stopping states.
+    "terminating": "stopping",
+}
+
+
+_logger = logging.getLogger(__name__)
+
+
+class SafeWatch:
+    """Wrapper for the kubernetes watch class that can recover in more situations."""
+
+    def __init__(self, watcher: "watch.Watch") -> None:
+        """Initialize the SafeWatch."""
+        self._watcher = watcher
+        self._last_seen_resource_version: Optional[str] = None
+        self._stopped = False
+
+    def stream(self, func: Any, *args: Any, **kwargs: Any) -> Any:
+        """Stream the watcher."""
+        while True:
+            try:
+                for event in self._watcher.stream(
+                    func, *args, **kwargs, timeout_seconds=15
+                ):
+                    if self._stopped:
+                        break
+                    # Save the resource version so that we can resume the stream
+                    # if it breaks.
+                    object = event.get("object")
+                    if isinstance(object, dict):
+                        self._last_seen_resource_version = object.get(
+                            "metadata", dict()
+                        ).get("resourceVersion")
+                    else:
+                        self._last_seen_resource_version = (
+                            object.metadata.resource_version
+                        )
+                    kwargs["resource_version"] = self._last_seen_resource_version
+                    yield event
+                # If stream ends after stop just break
+                if self._stopped:
+                    break
+            except urllib3.exceptions.ProtocolError as e:
+                wandb.termwarn(f"Broken event stream: {e}")
+            except ApiException as e:
+                if e.status == 410:
+                    # If resource version is too old we need to start over.
+                    del kwargs["resource_version"]
+                    self._last_seen_resource_version = None
+            except Exception as E:
+                wandb.termerror(f"Unknown exception in event stream: {E}")
+
+    def stop(self) -> None:
+        """Stop the watcher."""
+        self._watcher.stop()
+        self._stopped = True
+
+
+def _is_preempted(status: "V1PodStatus") -> bool:
+    """Check if this pod has been preempted."""
+    if hasattr(status, "conditions") and status.conditions is not None:
+        for condition in status.conditions:
+            if condition.type == "DisruptionTarget" and condition.reason in [
+                "EvictionByEvictionAPI",
+                "PreemptionByScheduler",
+                "TerminationByKubelet",
+            ]:
+                return True
+    return False
+
+
+def _is_container_creating(status: "V1PodStatus") -> bool:
+    """Check if this pod has started creating containers."""
+    for container_status in status.container_statuses or []:
+        if (
+            container_status.state
+            and container_status.state.waiting
+            and container_status.state.waiting.reason == "ContainerCreating"
+        ):
+            return True
+    return False
+
+
+def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[str]:
+    """Get the status from the pod conditions."""
+    true_conditions = [
+        c.get("type", "").lower() for c in conditions if c.get("status") == "True"
+    ]
+    detected_states = {
+        CRD_STATE_DICT[c] for c in true_conditions if c in CRD_STATE_DICT
+    }
+    for state in ["finished", "failed", "stopping", "running", "starting"]:
+        if state in detected_states:
+            return state
+    return None
+
+
+class KubernetesRunMonitor:
+    def __init__(
+        self,
+        job_field_selector: str,
+        pod_label_selector: str,
+        namespace: str,
+        batch_api: "BatchV1Api",
+        core_api: "CoreV1Api",
+        custom_api: "CustomObjectsApi" = None,
+        group: Optional[str] = None,
+        version: Optional[str] = None,
+        plural: Optional[str] = None,
+    ) -> None:
+        """Initialize KubernetesRunMonitor.
+
+        If a custom api is provided, the group, version, and plural arguments must also
+        be provided. These are used to query the custom api for a launched custom
+        object (CRD). Group, version, and plural in this context refer to the
+        Kubernetes API group, version, and plural for the CRD. For more information
+        see: https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/
+
+        The run monitor starts two threads to watch for pods and jobs/crds matching the
+        provided selectors. The status is set to "starting" when the run monitor is
+        initialized. The status is set to "running" when a pod matching the pod selector
+        is found with a status of "Running" or has a container with a status of
+        "ContainerCreating". The status is set to "finished" when a job matching the job
+        selector is found with a status of "Succeeded". The status is set to "failed"
+        when a job matching the job selector is found with a status of "Failed" or a pod
+        matching the pod selector is found with a status of "Failed". The status is set
+        to "preempted" when a pod matching the pod selector is found with a condition
+        type of "DisruptionTarget" and a reason of "EvictionByEvictionAPI",
+        "PreemptionByScheduler", or "TerminationByKubelet".
+
+        The logic for the CRD is similar to the logic for the job, but we inspect
+        both the phase of the CRD and the conditions since some CRDs do not have a
+        phase field.
+
+        Arguments:
+            job_field_selector: The field selector for the job or crd.
+            pod_label_selector: The label selector for the pods.
+            namespace: The namespace to monitor.
+            batch_api: The batch api client.
+            core_api: The core api client.
+            custom_api: The custom api client.
+            group: The group of the CRD.
+            version: The version of the CRD.
+            plural: The plural of the CRD.
+
+        Returns:
+            None.
+        """
+        self.pod_label_selector = pod_label_selector
+        self.job_field_selector = job_field_selector
+        self.namespace = namespace
+        self.batch_api = batch_api
+        self.core_api = core_api
+        self.custom_api = custom_api
+        self.group = group
+        self.version = version
+        self.plural = plural
+
+        self._status_lock = Lock()
+        self._status = Status("starting")
+
+        # Only one of the job or crd watchers will be used.
+        self._watch_job_thread = Thread(target=self._watch_job, daemon=True)
+        self._watch_crd_thread = Thread(target=self._watch_crd, daemon=True)
+
+        self._watch_pods_thread = Thread(target=self._watch_pods, daemon=True)
+
+        self._job_watcher = SafeWatch(watch.Watch())
+        self._pod_watcher = SafeWatch(watch.Watch())
+
+    def start(self) -> None:
+        """Start the run monitor."""
+        if self.custom_api is None:
+            self._watch_job_thread.start()
+        else:
+            self._watch_crd_thread.start()
+        self._watch_pods_thread.start()
+
+    def stop(self) -> None:
+        """Stop the run monitor."""
+        self._job_watcher.stop()
+        self._pod_watcher.stop()
+
+    def _set_status(self, status: Status) -> None:
+        """Set the run status."""
+        with self._status_lock:
+            self._status = status
+
+    def get_status(self) -> Status:
+        """Get the run status."""
+        with self._status_lock:
+            # Each time this is called we verify that our watchers are active.
+            if self._status.state in ["running", "starting"]:
+                if self.custom_api is None:
+                    if not self._watch_job_thread.is_alive():
+                        wandb.termwarn(
+                            f"Job watcher thread is dead for {self.job_field_selector}"
+                        )
+                        self._watch_job_thread = Thread(
+                            target=self._watch_job, daemon=True
+                        )
+                        self._watch_job_thread.start()
+                else:
+                    if not self._watch_crd_thread.is_alive():
+                        wandb.termwarn(
+                            f"CRD watcher thread is dead for {self.job_field_selector}"
+                        )
+                        self._watch_crd_thread = Thread(
+                            target=self._watch_crd, daemon=True
+                        )
+                        self._watch_crd_thread.start()
+                if not self._watch_pods_thread.is_alive():
+                    wandb.termwarn(
+                        f"Pod watcher thread is dead for {self.pod_label_selector}"
+                    )
+                    self._watch_pods_thread = Thread(
+                        target=self._watch_pods, daemon=True
+                    )
+                    self._watch_pods_thread.start()
+            return self._status
+
+    def _watch_pods(self) -> None:
+        """Watch for pods created matching the jobname."""
+        # Stream with no timeout polling for pod status updates
+        for event in self._pod_watcher.stream(
+            self.core_api.list_namespaced_pod,
+            namespace=self.namespace,
+            label_selector=self.pod_label_selector,
+        ):
+            object = event.get("object")
+            # Sometimes ADDED events will be missing field.
+            if not hasattr(object, "status"):
+                continue
+            if object.status.phase == "Running":
+                self._set_status(Status("running"))
+            if _is_preempted(object.status):
+                self._set_status(Status("preempted"))
+                self.stop()
+                break
+            if _is_container_creating(object.status):
+                self._set_status(Status("running"))
+
+    def _watch_job(self) -> None:
+        """Watch for job matching the jobname."""
+        for event in self._job_watcher.stream(
+            self.batch_api.list_namespaced_job,
+            namespace=self.namespace,
+            field_selector=self.job_field_selector,
+        ):
+            object = event.get("object")
+            if object.status.succeeded == 1:
+                self._set_status(Status("finished"))
+                self.stop()
+                break
+            elif object.status.failed is not None and object.status.failed >= 1:
+                self._set_status(Status("failed"))
+                self.stop()
+                break
+
+    def _watch_crd(self) -> None:
+        """Watch for CRD matching the jobname."""
+        for event in self._job_watcher.stream(
+            self.custom_api.list_namespaced_custom_object,
+            namespace=self.namespace,
+            field_selector=self.job_field_selector,
+            group=self.group,
+            version=self.version,
+            plural=self.plural,
+        ):
+            object = event.get("object")
+            status = object.get("status")
+            if status is None:
+                continue
+            state = status.get("state")
+            if isinstance(state, dict):
+                raw_state = state.get("phase", "")
+                state = CRD_STATE_DICT.get(raw_state)
+            else:
+                conditions = status.get("conditions")
+                if isinstance(conditions, list):
+                    state = _state_from_conditions(conditions)
+                else:
+                    # This should never happen.
+                    _logger.warning(
+                        f"Unexpected conditions type {type(conditions)} "
+                        f"for CRD {self.job_field_selector}: {conditions}"
+                    )
+            if state is None:
+                continue
+            status = Status(state)
+            self._set_status(status)
+            if status.state in ["finished", "failed", "preempted"]:
+                self.stop()
+                break
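A sketch of how a runner might drive the new monitor, assuming a configured Kubernetes client. The namespace and selectors are hypothetical placeholders; only the constructor arguments and methods shown in the file above are used.

```python
from kubernetes import client, config

from wandb.sdk.launch.runner.kubernetes_monitor import KubernetesRunMonitor

config.load_kube_config()  # or load_incluster_config() inside a cluster

monitor = KubernetesRunMonitor(
    job_field_selector="metadata.name=launch-abc123",  # hypothetical job name
    pod_label_selector="job-name=launch-abc123",
    namespace="default",
    batch_api=client.BatchV1Api(),
    core_api=client.CoreV1Api(),
)
monitor.start()

# Poll until a terminal state; get_status() also restarts dead watcher threads.
status = monitor.get_status()
if status.state in ("finished", "failed", "preempted"):
    monitor.stop()
```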