konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Batch job execution via k8s jobsets
|
|
2
|
+
https://jobset.sigs.k8s.io/
|
|
3
|
+
https://kueue.sigs.k8s.io/docs/tasks/run/jobsets/
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
import typing
|
|
9
|
+
from typing import Any, Dict, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
import colorama
|
|
12
|
+
|
|
13
|
+
if typing.TYPE_CHECKING:
|
|
14
|
+
import konduktor
|
|
15
|
+
from konduktor.data import storage as storage_lib
|
|
16
|
+
|
|
17
|
+
from konduktor import config, logging
|
|
18
|
+
from konduktor.backends import backend, jobset_utils, pod_utils
|
|
19
|
+
from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils
|
|
20
|
+
|
|
21
|
+
# Alias for `str`, used in the file-mount type annotations of this module.
Path = str
logger = logging.get_logger(__file__)

# Seconds slept between two consecutive jobset status polls.
POLL_INTERVAL = 5
# Fallback (in seconds) used when `kubernetes.provision_timeout` is not set
# in the config.
DEFAULT_ATTACH_TIMEOUT = 86400  # 1 day
# NOTE(review): not referenced in the visible part of this module; the log
# flush wait reads `logs.timeout` from the config instead -- confirm whether
# this constant is still needed.
FLUSH_LOGS_TIMEOUT = 5
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class JobsetError(Exception):
    """Error raised by this module when a jobset fails.

    Used for jobs that exit with a non-zero code, jobs that do not start
    within the provisioning timeout, and unexpected errors while attached
    to a running job.
    """
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _raise_job_error(job):
    """Raise a descriptive error for a failed job, or log if the cause is unknown.

    Walks ``job.status.conditions`` in order and raises on the first
    condition that matches a known failure signature. If no condition
    matches, an error is logged and the function returns normally.

    Args:
        job: Job object exposing ``status.conditions``, ``metadata.name`` and
            ``spec.template.spec.containers`` (presumably a Kubernetes
            ``V1Job`` -- confirm against ``jobset_utils.get_job``).

    Raises:
        ValueError: If a condition message mentions ``ConfigIssue``.
        JobsetError: If a condition reason is ``BackoffLimitExceeded``.
    """
    # `conditions` and `message` are optional and may be None; treat them as
    # empty instead of failing with a TypeError while reporting a failure.
    for condition in job.status.conditions or []:
        message = condition.message or ''
        if 'ConfigIssue' in message:
            raise ValueError(
                'Job failed with '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'ConfigIssue: ErrImagePull.{colorama.Style.RESET_ALL} '
                'Check that your '
                f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
                f'`image_id`{colorama.Style.RESET_ALL} is correct and '
                'your container credentials are correct. Image specified '
                'in your task definition is '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'`{job.spec.template.spec.containers[0].image}`'
                f'{colorama.Style.RESET_ALL}'
            )
        if condition.reason == 'BackoffLimitExceeded':
            raise JobsetError('Job failed with non-zero exit code.')

    # No known failure signature matched: point the user at kubectl.
    logger.error(
        'Job failed with unknown error. Check jobset status in k8s with '
        f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
        f'`kubectl get job -o yaml {job.metadata.name}`'
        f'{colorama.Style.RESET_ALL}'
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _wait_for_jobset_start(namespace: str, job_name: str):
    """Block until the jobset is ready, has finished, or the timeout elapses.

    Polls the jobset every ``POLL_INTERVAL`` seconds and inspects the first
    entry of ``status.replicatedJobsStatus``. Returns as soon as that entry
    reports ``ready`` or ``succeeded``. If it reports ``failed``, the
    underlying job is fetched and handed to ``_raise_job_error``.

    The timeout is read from the ``kubernetes.provision_timeout`` config key
    (default ``DEFAULT_ATTACH_TIMEOUT`` seconds); ``-1`` disables it. When
    the timeout is exceeded the jobset is deleted before raising.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to wait for.

    Raises:
        AssertionError: If the jobset lookup returns ``None``.
        JobsetError: If the job does not start within the timeout, or as
            raised by ``_raise_job_error`` for a failed job.
        ValueError: As raised by ``_raise_job_error`` for a failed job.
    """
    # Short fixed delay before the first poll (presumably to let the freshly
    # created jobset get a status -- confirm).
    time.sleep(2)
    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    start = time.monotonic()
    timeout = config.get_nested(
        ('kubernetes', 'provision_timeout'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
    )

    while True:
        jobsets = jobset_utils.get_jobset(namespace, job_name)
        assert jobsets is not None, (
            f'Jobset {job_name} ' f'not found in namespace {namespace}'
        )
        # The status, or its replicated-job list, may not be populated yet;
        # in that case just keep polling instead of raising KeyError.
        replicated = (jobsets.get('status') or {}).get('replicatedJobsStatus') or []
        if replicated:
            first = replicated[0]
            if first.get('ready'):
                logger.info(
                    'task '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} ready'
                )
                return
            if first.get('succeeded'):
                return
            if first.get('failed'):
                logger.info(
                    'job '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} '
                    f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
                )
                job = jobset_utils.get_job(namespace, job_name)
                _raise_job_error(job)
                return
        if timeout != -1 and time.monotonic() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}'
                f'{colorama.Fore.RED}Job timed out to schedule.'
                f'{colorama.Style.RESET_ALL}. Deleting job'
            )
            jobset_utils.delete_jobset(namespace, job_name)
            # The hint names the config key that is actually read above.
            raise JobsetError(
                'Job failed to start within '
                f'timeout of {timeout} seconds. '
                'Increase or disable timeout '
                f'{colorama.Style.BRIGHT}'
                '`kubernetes.provision_timeout: -1`'
                f'{colorama.Style.RESET_ALL}'
            )
        time.sleep(POLL_INTERVAL)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _wait_for_jobset_completion(namespace: str, job_name: str) -> Tuple[bool, str]:
    """Poll the jobset until its first replicated job succeeds or fails.

    Polls every ``POLL_INTERVAL`` seconds with no upper time limit.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to wait for.

    Returns:
        A ``(is_success, message)`` tuple, where ``message`` is a colorized,
        human-readable summary suitable for logging.

    Raises:
        AssertionError: If the jobset lookup returns ``None``.
    """
    # Shared prefix of both result messages.
    task_label = (
        'task '
        f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
        f'{colorama.Style.RESET_ALL} '
    )
    while True:
        jobsets = jobset_utils.get_jobset(namespace, job_name)
        assert jobsets is not None, (
            f'Jobset {job_name} ' f'not found in namespace {namespace}'
        )
        # Tolerate a missing or empty status the same way the start waiter
        # does: keep polling rather than raising KeyError/IndexError.
        replicated = (jobsets.get('status') or {}).get('replicatedJobsStatus') or []
        if replicated:
            first = replicated[0]
            if first.get('succeeded'):
                msg = (
                    f'{task_label}{colorama.Fore.GREEN}'
                    f'{colorama.Style.BRIGHT}finished{colorama.Style.RESET_ALL}'
                )
                return True, msg
            if first.get('failed'):
                msg = (
                    f'{task_label}{colorama.Fore.RED}'
                    f'{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
                )
                return False, msg
        time.sleep(POLL_INTERVAL)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class JobsetBackend(backend.Backend):
    """Backend that runs a task by creating a Kubernetes jobset."""

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Hook for syncing files/directories to cloud storage before launch.

        Intended to upload local files/dirs to cloud storage so they can be
        downloaded by the pods when they start. This backend currently does
        nothing here.
        """
        pass

    def _sync_workdir(self, workdir: str) -> None:
        """Hook for syncing the working directory to cloud storage.

        This backend currently does nothing here.
        """
        pass

    def _post_execute(self) -> None:
        """Hook run after execution; currently a no-op.

        TODO(asaiacai): add some helpful messages/commands that a user can run
        to inspect the status of their jobset.
        """
        pass

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool = False, dryrun: bool = False
    ) -> Optional[str]:
        """Execute the task on the cluster by creating a jobset.

        Unless ``dryrun`` or ``detach_run`` is set, this waits for the job to
        start, streams its logs from a background thread and blocks until the
        job succeeds or fails. Ctrl+C while streaming detaches without
        raising.

        Args:
            task: Task to run; ``task.name`` is used as the jobset name.
            detach_run: If True, return right after the jobset is created.
            dryrun: Forwarded to ``jobset_utils.create_jobset``; also skips
                waiting and log streaming.

        Returns:
            ``task.name``, which serves as the job id.

        Raises:
            JobsetError: If anything other than a KeyboardInterrupt goes
                wrong while streaming logs or waiting for completion, or as
                raised while waiting for the job to start.
        """

        # we should consider just building an image with the cloud provider
        # sdks baked in. These can initialize and pull files first before
        # the working container starts.

        # first define the pod spec then create the jobset definition
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        # TODO(asaiacai): need to set env variables in pod
        jobset_response: Optional[Dict[str, Any]] = jobset_utils.create_jobset(
            namespace,
            task,
            pod_spec['kubernetes']['pod_config'],
            dryrun=dryrun,
        )

        if not dryrun and not detach_run:
            with ux_utils.print_exception_no_traceback():
                # NOTE(review): the spinner advertises Ctrl+C to detach, but
                # KeyboardInterrupt is only handled in the try block below,
                # not during this wait -- confirm this is intended.
                with rich_utils.safe_status(
                    ux_utils.spinner_message(
                        'waiting for job to start. ' 'Press Ctrl+C to detach. \n'
                    )
                ):
                    _wait_for_jobset_start(namespace, task.name)
                try:
                    assert jobset_response is not None
                    # Daemon thread: it must not keep the process alive if
                    # we detach or fail before it finishes.
                    log_thread = threading.Thread(
                        target=log_utils.tail_logs,
                        args=(task.name,),
                        daemon=True,
                    )
                    logger.info('streaming logs...')
                    log_thread.start()
                    is_success, msg = _wait_for_jobset_completion(namespace, task.name)
                    # give the log thread some time to flush remaining output
                    log_thread.join(
                        timeout=config.get_nested(('logs', 'timeout'), 60.0)
                    )
                    if not is_success:
                        logger.error(msg)
                    else:
                        logger.info(msg)
                except KeyboardInterrupt:
                    logger.info('detaching from log stream...')
                except Exception as err:
                    logger.error(
                        'Check if job resources are '
                        'active/queued with '
                        f'{colorama.Style.BRIGHT}'
                        '`konduktor status`'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    # Chain explicitly so the original failure is preserved
                    # as the cause of the JobsetError.
                    raise JobsetError(f'error: {err}') from err
        else:
            logger.info('detaching from run.')
        return task.name
|