konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/execution.py
ADDED
@@ -0,0 +1,444 @@
+"""Execution layer."""
+
+import os
+import tempfile
+import typing
+from typing import Any, Dict, Optional
+
+import colorama
+
+if typing.TYPE_CHECKING:
+    import konduktor
+
+from konduktor import config, constants
+from konduktor import logging as konduktor_logging
+from konduktor.backends import JobsetBackend
+from konduktor.data import constants as storage_constants
+from konduktor.data import data_utils
+from konduktor.data import storage as storage_lib
+from konduktor.utils import common_utils, exceptions, rich_utils, ux_utils
+
+logger = konduktor_logging.get_logger(__name__)
+
+
+def _execute(
+    task: 'konduktor.Task',
+    dryrun: bool = False,
+    detach_run: bool = False,
+) -> Optional[str]:
+    """Execute an task.
+
+    Args:
+        task: konduktor.Task
+        dryrun: bool; if True, only print the provision info (e.g., cluster
+            yaml).
+        stream_logs: bool; whether to stream all tasks' outputs to the client.
+        cluster_name: Name of the cluster to create/reuse. If None,
+            auto-generate a name.
+
+    Returns:
+        workload_id: Optional[int]; the job ID of the submitted job. None if the
+            backend is not CloudVmRayBackend, or no job is submitted to
+            the cluster.
+    """
+    # (asaiacai): in the future we may support more backends but not likely
+    backend = JobsetBackend()
+    # template the commands for syncing the contents within the shell command
+    # initialization of the pod
+    job_name = backend.execute(task, detach_run, dryrun=dryrun)
+
+    if dryrun:
+        logger.info('Dryrun finished.')
+        return None
+
+    # attach to head node output if detach_run is False
+    backend.post_execute()
+
+    return job_name
+
+
+def launch(
+    task: 'konduktor.Task',
+    dryrun: bool = False,
+    detach_run: bool = False,
+) -> Optional[str]:
+    """Launch a task
+
+    Args:
+        task: konduktor.Task
+        dryrun: if True, do not actually launch the task.
+        detach_run: If True, as soon as a job is submitted, return from this
+            function and do not stream execution logs.
+
+    Example:
+        .. code-block:: python
+
+            import konduktor
+            task = konduktor.Task(run='echo hello konduktor')
+            konduktor.launch(task)
+
+    Raises:
+        Other exceptions may be raised depending on the backend.
+
+    Returns:
+        workload_id: Optional[str]; the job ID of the submitted job.
+    """
+
+    maybe_translate_local_file_mounts_and_sync_up(task, 'job')
+
+    return _execute(
+        task=task,
+        dryrun=dryrun,
+        detach_run=detach_run,
+    )
+
+
+# (maybe translate local file mounts) and (sync up)
+def maybe_translate_local_file_mounts_and_sync_up(
+    task: 'konduktor.Task', task_type: str
+) -> None:
+    """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
+
+    Eagerly syncing up local->Storage ensures Storage->VM would work at task
+    launch time.
+
+    If there are no local source paths to be translated, this function would
+    still sync up any storage mounts with local source paths (which do not
+    undergo translation).
+
+    When jobs.bucket or serve.bucket is not specified, an intermediate storage
+    dedicated for the job is created for the workdir and local file mounts and
+    the storage is deleted when the job finishes. We don't share the storage
+    between jobs, because jobs might have different resources requirements, and
+    sharing storage between jobs may cause egress costs or slower transfer
+    speeds.
+    """
+
+    # ================================================================
+    # Translate the workdir and local file mounts to cloud file mounts.
+    # ================================================================
+
+    def _sub_path_join(sub_path: Optional[str], path: str) -> str:
+        if sub_path is None:
+            return path
+        return os.path.join(sub_path, path).strip('/')
+
+    # We use uuid to generate a unique run id for the job, so that the bucket/
+    # subdirectory name is unique across different jobs/services.
+    # We should not use common_utils.get_usage_run_id() here, because when
+    # Python API is used, the run id will be the same across multiple
+    # jobs.launch/serve.up calls after the sky is imported.
+    run_id = common_utils.get_usage_run_id()[:4]
+    user_hash = common_utils.get_user_hash()
+    original_file_mounts = task.file_mounts if task.file_mounts else {}
+    original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
+
+    copy_mounts = task.get_local_to_remote_file_mounts()
+    if copy_mounts is None:
+        copy_mounts = {}
+
+    has_local_source_paths_file_mounts = bool(copy_mounts)
+    has_local_source_paths_workdir = task.workdir is not None
+
+    msg = None
+    if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
+        msg = 'workdir and file_mounts with local source paths'
+    elif has_local_source_paths_file_mounts:
+        msg = 'file_mounts with local source paths'
+    elif has_local_source_paths_workdir:
+        msg = 'workdir'
+    if msg:
+        logger.info(
+            ux_utils.starting_message(f'Translating {msg} to ' 'SkyPilot Storage...')
+        )
+        rich_utils.force_update_status(
+            ux_utils.spinner_message(f'Translating {msg} to SkyPilot Storage...')
+        )
+
+    # Get the bucket name for the workdir and file mounts,
+    # we store all these files in same bucket from config.
+    bucket_wth_prefix = config.get_nested((task_type, 'bucket'), None)
+    store_kwargs: Dict[str, Any] = {}
+    if bucket_wth_prefix is None:
+        store_type = sub_path = None
+        storage_account_name = region = None
+        bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
+            username=common_utils.get_cleaned_username(), user_hash=user_hash, id=run_id
+        )
+    else:
+        (store_type, bucket_name, sub_path, storage_account_name, region) = (
+            storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix)
+        )
+        if storage_account_name is not None:
+            store_kwargs['storage_account_name'] = storage_account_name
+        if region is not None:
+            store_kwargs['region'] = region
+    # Step 1: Translate the workdir to SkyPilot storage.
+    new_storage_mounts = {}
+    if task.workdir is not None:
+        workdir = task.workdir
+        task.workdir = None
+        if (
+            constants.KONDUKTOR_REMOTE_WORKDIR in original_file_mounts
+            or constants.KONDUKTOR_REMOTE_WORKDIR in original_storage_mounts
+        ):
+            raise ValueError(
+                f'Cannot mount {constants.KONDUKTOR_REMOTE_WORKDIR} as both the '
+                'workdir and file_mounts contains it as the target.'
+            )
+        bucket_sub_path = _sub_path_join(
+            sub_path,
+            constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(
+                task_name=task.name, run_id=run_id
+            ),
+        )
+        stores = None
+        if store_type is not None:
+            stores = [store_type]
+
+        storage_obj = storage_lib.Storage(
+            name=bucket_name,
+            source=workdir,
+            persistent=False,
+            mode=storage_lib.StorageMode.COPY,
+            stores=stores,
+            # Set `_is_sky_managed` to False when `bucket_with_prefix` is
+            # specified, so that the storage is not deleted when job finishes,
+            # but only the sub path is deleted.
+            # _is_sky_managed=bucket_wth_prefix is None,
+            _is_sky_managed=False,
+            _bucket_sub_path=bucket_sub_path,
+        )
+        new_storage_mounts[constants.KONDUKTOR_REMOTE_WORKDIR] = storage_obj
+        # Check of the existence of the workdir in file_mounts is done in
+        # the task construction.
+        logger.info(
+            f' {colorama.Style.DIM}Workdir: {workdir!r} '
+            f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+        )
+
+    # Step 2: Translate the local file mounts with folder in src to SkyPilot
+    # storage.
+    # TODO(zhwu): Optimize this by:
+    # 1. Use the same bucket for all the mounts.
+    # 2. When the src is the same, use the same bucket.
+    copy_mounts_with_file_in_src = {}
+    for i, (dst, src) in enumerate(copy_mounts.items()):
+        assert task.file_mounts is not None
+        task.file_mounts.pop(dst)
+        if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
+            copy_mounts_with_file_in_src[dst] = src
+            continue
+        bucket_sub_path = _sub_path_join(
+            sub_path,
+            constants.FILE_MOUNTS_SUBPATH.format(
+                task_name=task.name, i=i, run_id=run_id
+            ),
+        )
+        stores = None
+        if store_type is not None:
+            stores = [store_type]
+        storage_obj = storage_lib.Storage(
+            name=bucket_name,
+            source=src,
+            persistent=False,
+            mode=storage_lib.StorageMode.COPY,
+            stores=stores,
+            # _is_sky_managed=not bucket_wth_prefix,
+            _is_sky_managed=False,
+            _bucket_sub_path=bucket_sub_path,
+        )
+        new_storage_mounts[dst] = storage_obj
+        logger.info(
+            f' {colorama.Style.DIM}Folder : {src!r} '
+            f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+        )
+
+    # Step 3: Translate local file mounts with file in src to SkyPilot storage.
+    # Hard link the files in src to a temporary directory, and upload folder.
+    file_mounts_tmp_subpath = _sub_path_join(
+        sub_path,
+        constants.FILE_MOUNTS_TMP_SUBPATH.format(task_name=task.name, run_id=run_id),
+    )
+    base_tmp_dir = os.path.expanduser(constants.FILE_MOUNTS_LOCAL_TMP_BASE_PATH)
+    os.makedirs(base_tmp_dir, exist_ok=True)
+    with tempfile.TemporaryDirectory(dir=base_tmp_dir) as temp_path:
+        local_fm_path = os.path.join(
+            temp_path, constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)
+        )
+        os.makedirs(local_fm_path, exist_ok=True)
+        file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
+            task_type
+        )
+        if copy_mounts_with_file_in_src:
+            src_to_file_id = {}
+            for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
+                src_to_file_id[src] = i
+                os.link(
+                    os.path.abspath(os.path.expanduser(src)),
+                    os.path.join(local_fm_path, f'file-{i}'),
+                )
+            stores = None
+            if store_type is not None:
+                stores = [store_type]
+            storage_obj = storage_lib.Storage(
+                name=bucket_name,
+                source=local_fm_path,
+                persistent=False,
+                # mode=storage_lib.StorageMode.MOUNT
+                mode=storage_lib.StorageMode.COPY,
+                stores=stores,
+                # _is_sky_managed=not bucket_wth_prefix,
+                _is_sky_managed=False,
+                _bucket_sub_path=file_mounts_tmp_subpath,
+            )
+
+            new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
+            if file_mount_remote_tmp_dir in original_storage_mounts:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Failed to translate file mounts, due to the default '
+                        f'destination {file_mount_remote_tmp_dir} '
+                        'being taken.'
+                    )
+            sources = list(src_to_file_id.keys())
+            sources_str = '\n '.join(sources)
+            logger.info(
+                f' {colorama.Style.DIM}Files (listed below) '
+                f' -> storage: {bucket_name}:'
+                f'\n {sources_str}{colorama.Style.RESET_ALL}'
+            )
+
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Uploading translated local files/folders')
+        )
+        task.update_storage_mounts(new_storage_mounts)
+
+        # Step 4: Upload storage from sources
+        # Upload the local source to a bucket. The task will not be executed
+        # locally, so we need to upload the files/folders to the bucket manually
+        # here before sending the task to the remote jobs controller. This will
+        # also upload any storage mounts that are not translated. After
+        # sync_storage_mounts, we will also have file_mounts in the task, but
+        # these aren't used since the storage_mounts for the same paths take
+        # precedence.
+        if task.storage_mounts:
+            # There may be existing (non-translated) storage mounts, so log this
+            # whenever task.storage_mounts is non-empty.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message(
+                    'Uploading local sources to storage[/] '
+                    '[dim]View storages: sky storage ls'
+                )
+            )
+        try:
+            task.sync_storage_mounts()
+        except (ValueError, exceptions.NoCloudAccessError) as e:
+            if 'No enabled cloud for storage' in str(e) or isinstance(
+                e, exceptions.NoCloudAccessError
+            ):
+                data_src = None
+                if has_local_source_paths_file_mounts:
+                    data_src = 'file_mounts'
+                if has_local_source_paths_workdir:
+                    if data_src:
+                        data_src += ' and workdir'
+                    else:
+                        data_src = 'workdir'
+                store_enabled_clouds = ', '.join(storage_constants.STORE_ENABLED_CLOUDS)
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.NotSupportedError(
+                        f'Unable to use {data_src} - no cloud with object '
+                        'store support is enabled. Please enable at least one '
+                        'cloud with object store support '
+                        f'({store_enabled_clouds}) by running `sky check`, or '
+                        f'remove {data_src} from your task.'
+                        '\nHint: If you do not have any cloud access, you may '
+                        'still download data and code over the network using '
+                        'curl or other tools in the `setup` section of the '
+                        'task.'
+                    ) from None
+
+    # Step 5: Add the file download into the file mounts, such as
+    # /original-dst: s3://spot-fm-file-only-bucket-name/file-0
+    new_file_mounts = {}
+    if copy_mounts_with_file_in_src:
+        # file_mount_remote_tmp_dir will only exist when there are files in
+        # the src for copy mounts.
+        storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
+        assert storage_obj.stores, (storage_obj.__dict__, task.to_yaml_config())
+        curr_store_type = list(storage_obj.stores.keys())[0]
+        store_object = storage_obj.stores[curr_store_type]
+        assert store_object is not None, (storage_obj.__dict__, task.to_yaml_config())
+        bucket_url = storage_lib.StoreType.get_endpoint_url(store_object, bucket_name)
+        bucket_url += f'/{file_mounts_tmp_subpath}'
+        for dst, src in copy_mounts_with_file_in_src.items():
+            file_id = src_to_file_id[src]
+            new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
+    task.update_file_mounts(new_file_mounts)
+
+    # Step 6: Replace the source field that is local path in all storage_mounts
+    # with bucket URI and remove the name field.
+    for storage_obj in task.storage_mounts.values():
+        if storage_obj.source is not None and not data_utils.is_cloud_store_url(
+            storage_obj.source
+        ):
+            # Need to replace the local path with bucket URI, and remove the
+            # name field, so that the storage mount can work on the jobs
+            # controller.
+            store_types = list(storage_obj.stores.keys())
+            assert len(store_types) == 1, (
+                'We only support one store type for now.',
+                storage_obj.stores,
+            )
+            curr_store_type = store_types[0]
+            store_object = storage_obj.stores[curr_store_type]
+            assert store_object is not None and storage_obj.name is not None, (
+                store_object,
+                storage_obj.name,
+            )
+            storage_obj.source = storage_lib.StoreType.get_endpoint_url(
+                store_object, storage_obj.name
+            )
+            storage_obj.force_delete = True
+
+    # Step 7: Convert all `MOUNT` mode storages which don't specify a source
+    # to specifying a source. If the source is specified with a local path,
+    # it was handled in step 6.
+    updated_mount_storages = {}
+    for storage_path, storage_obj in task.storage_mounts.items():
+        if storage_obj.mode == storage_lib.StorageMode.MOUNT and not storage_obj.source:
+            # Construct source URL with first store type and storage name
+            # E.g., s3://my-storage-name
+            store_types = list(storage_obj.stores.keys())
+            assert len(store_types) == 1, (
+                'We only support one store type for now.',
+                storage_obj.stores,
+            )
+            curr_store_type = store_types[0]
+            store_object = storage_obj.stores[curr_store_type]
+            assert store_object is not None and storage_obj.name is not None, (
+                store_object,
+                storage_obj.name,
+            )
+            source = storage_lib.StoreType.get_endpoint_url(
+                store_object, storage_obj.name
+            )
+            assert store_object is not None and storage_obj.name is not None, (
+                store_object,
+                storage_obj.name,
+            )
+            new_storage = storage_lib.Storage.from_yaml_config(
+                {
+                    'source': source,
+                    'persistent': storage_obj.persistent,
+                    'mode': storage_lib.StorageMode.MOUNT.value,
+                    # We enable force delete to allow the controller to delete
+                    # the object store in case persistent is set to False.
+                    '_force_delete': True,
+                }
+            )
+            updated_mount_storages[storage_path] = new_storage
+    task.update_storage_mounts(updated_mount_storages)
+    if msg:
+        logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
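
The `launch()` docstring above already sketches the intended entry point. For orientation, here is a minimal, hypothetical driver built only from what this diff shows (`Task(run=...)` and `launch(task, dryrun, detach_run)` returning an optional job name); nothing here is part of the package itself:

    # Hypothetical usage sketch based solely on the docstrings in this diff.
    import konduktor

    task = konduktor.Task(run='echo hello konduktor')
    # Any local workdir/file_mounts on the task would first be translated to an
    # intermediate bucket by maybe_translate_local_file_mounts_and_sync_up()
    # before JobsetBackend submits the job.
    job_name = konduktor.launch(task, detach_run=True)
    if job_name is not None:
        print(f'Submitted job: {job_name}')

With `detach_run=True` the call returns as soon as the jobset is submitted, per the docstring; with the default it attaches to the job's output after submission.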
konduktor/kube_client.py
CHANGED
@@ -1,62 +1,158 @@
+import logging
 import os
+from typing import Any, Callable, Optional
 
 import kubernetes
 import urllib3
 
 from konduktor import logging as konduktor_logging
+from konduktor.utils import annotations, ux_utils
 
 logger = konduktor_logging.get_logger(__name__)
 
 # Timeout to use for API calls
 API_TIMEOUT = 5
- ... (48 lines removed here, old lines 12-59; their content is not preserved in this view: only a stray '#' and three bare 'def' fragments survive)
+DEFAULT_NAMESPACE = 'default'
+DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
+# The name for the environment variable that stores the in-cluster context name
+# for Kubernetes clusters. This is used to associate a name with the current
+# context when running with in-cluster auth. If not set, the context name is
+# set to DEFAULT_IN_CLUSTER_REGION.
+IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
+
+
+def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
+    for attr_name in dir(obj):
+        attr = getattr(obj, attr_name)
+        # Skip methods starting with '__' since they are invoked through one
+        # of the main methods, which are already decorated.
+        if callable(attr) and not attr_name.startswith('__'):
+            continue
+    return obj
+
+
+def _api_logging_decorator(logger: str, level: int):
+    """Decorator to set logging level for API calls.
+
+    This is used to suppress the verbose logging from urllib3 when calls to the
+    Kubernetes API timeout.
+    """
+
+    def decorated_api(api):
+        def wrapped(*args, **kwargs):
+            obj = api(*args, **kwargs)
+            _decorate_methods(
+                obj, konduktor_logging.set_logging_level(logger, level), 'api_log'
+            )
+            return obj
+
+        return wrapped
+
+    return decorated_api
+
+
+def _load_config(context: Optional[str] = None):
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    def _load_config_from_kubeconfig(context: Optional[str] = None):
+        try:
+            kubernetes.config.load_kube_config(context=context)
+        except kubernetes.config.config_exception.ConfigException as e:
+            # Check if exception was due to no current-context
+            if 'Expected key current-context' in str(e):
+                err_str = (
+                    f'Failed to load Kubernetes configuration for {context!r}. '
+                    'Kubeconfig does not contain any valid context(s).\n'
+                    '    If you were running a local Kubernetes '
+                    'cluster, run `sky local up` to start the cluster.'
+                )
+            else:
+                err_str = (
+                    f'Failed to load Kubernetes configuration for {context!r}. '
+                    'Please check if your kubeconfig file exists at '
+                    f'~/.kube/config and is valid.'
+                )
+            err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(err_str) from None
+
+    if context == in_cluster_context_name() or context is None:
+        try:
+            # Load in-cluster config if running in a pod and context is None.
+            # Kubernetes set environment variables for service discovery do not
+            # show up in SkyPilot tasks. For now, we work around by using
+            # DNS name instead of environment variables.
+            # See issue: https://github.com/skypilot-org/skypilot/issues/2287
+            os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+            os.environ['KUBERNETES_SERVICE_PORT'] = '443'
+            kubernetes.config.load_incluster_config()
+        except kubernetes.config.config_exception.ConfigException:
+            _load_config_from_kubeconfig()
+    else:
+        _load_config_from_kubeconfig(context)
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def core_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CoreV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def auth_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.RbacAuthorizationV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def networking_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.NetworkingV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def crd_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CustomObjectsApi()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def node_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.NodeV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def apps_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.AppsV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def api_client(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.ApiClient()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def batch_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.BatchV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def crd_client(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CustomObjectsApi()
 
 
 def api_exception():
@@ -73,3 +169,12 @@ def max_retry_error():
 
 def stream():
     return kubernetes.stream.stream
+
+
+def in_cluster_context_name() -> Optional[str]:
+    """Returns the name of the in-cluster context from the environment.
+
+    If the environment variable is not set, returns the default in-cluster
+    context name.
+    """
+    return os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR) or DEFAULT_IN_CLUSTER_REGION
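
The new accessors all follow one pattern: load config for an optional context, then return a client that is cached per request scope and whose urllib3 noise is suppressed on timeouts. A minimal sketch of how a caller might use them, assuming a reachable cluster with a valid kubeconfig or in-cluster service account (the pod-listing call is standard kubernetes-client API, not something this diff adds):

    # Sketch only; assumes cluster access is already configured.
    from konduktor import kube_client

    # Loads kubeconfig (or in-cluster config) and returns a CoreV1Api client;
    # repeated calls in the same request scope reuse the cached instance.
    v1 = kube_client.core_api()
    pods = v1.list_namespaced_pod(
        namespace=kube_client.DEFAULT_NAMESPACE,
        _request_timeout=kube_client.API_TIMEOUT,
    )
    for pod in pods.items:
        print(pod.metadata.name)

When running inside a pod, setting SKYPILOT_IN_CLUSTER_CONTEXT_NAME gives the in-cluster context a name; otherwise `in_cluster_context_name()` falls back to 'in-cluster'.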