konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/execution.py ADDED
@@ -0,0 +1,444 @@
+"""Execution layer."""
+
+import os
+import tempfile
+import typing
+from typing import Any, Dict, Optional
+
+import colorama
+
+if typing.TYPE_CHECKING:
+    import konduktor
+
+from konduktor import config, constants
+from konduktor import logging as konduktor_logging
+from konduktor.backends import JobsetBackend
+from konduktor.data import constants as storage_constants
+from konduktor.data import data_utils
+from konduktor.data import storage as storage_lib
+from konduktor.utils import common_utils, exceptions, rich_utils, ux_utils
+
+logger = konduktor_logging.get_logger(__name__)
+
+
+def _execute(
+    task: 'konduktor.Task',
+    dryrun: bool = False,
+    detach_run: bool = False,
+) -> Optional[str]:
+    """Execute a task.
+
+    Args:
+        task: konduktor.Task to execute.
+        dryrun: bool; if True, only print the provision info (e.g., the
+            rendered job YAML) without submitting the job.
+        detach_run: bool; if False, stream the job's logs after submission.
+
+    Returns:
+        job_name: Optional[str]; the name of the submitted job, or None if
+            dryrun is True.
+    """
+    # (asaiacai): in the future we may support more backends, but not likely.
+    backend = JobsetBackend()
+    # The backend templates the file-mount sync commands into the shell
+    # command that initializes the pod.
+    job_name = backend.execute(task, detach_run, dryrun=dryrun)
+
+    if dryrun:
+        logger.info('Dryrun finished.')
+        return None
+
+    # Attach to the head node's output if detach_run is False.
+    backend.post_execute()
+
+    return job_name
+
+
+def launch(
+    task: 'konduktor.Task',
+    dryrun: bool = False,
+    detach_run: bool = False,
+) -> Optional[str]:
+    """Launch a task.
+
+    Args:
+        task: konduktor.Task to launch.
+        dryrun: if True, do not actually launch the task.
+        detach_run: If True, as soon as a job is submitted, return from this
+            function and do not stream execution logs.
+
+    Example:
+        .. code-block:: python
+
+            import konduktor
+            task = konduktor.Task(run='echo hello konduktor')
+            konduktor.launch(task)
+
+    Raises:
+        Other exceptions may be raised depending on the backend.
+
+    Returns:
+        job_name: Optional[str]; the name of the submitted job.
+    """
+
+    maybe_translate_local_file_mounts_and_sync_up(task, 'job')
+
+    return _execute(
+        task=task,
+        dryrun=dryrun,
+        detach_run=detach_run,
+    )
+
+
+# (maybe translate local file mounts) and (sync up)
+def maybe_translate_local_file_mounts_and_sync_up(
+    task: 'konduktor.Task', task_type: str
+) -> None:
+    """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
+
+    Eagerly syncing up local->Storage ensures Storage->VM would work at task
+    launch time.
+
+    If there are no local source paths to be translated, this function still
+    syncs up any storage mounts with local source paths (which do not
+    undergo translation).
+
+    When jobs.bucket or serve.bucket is not specified, an intermediate
+    storage dedicated to the job is created for the workdir and local file
+    mounts, and the storage is deleted when the job finishes. We don't share
+    the storage between jobs, because jobs might have different resource
+    requirements, and sharing storage between jobs may cause egress costs or
+    slower transfer speeds.
+    """
+
+    # ================================================================
+    # Translate the workdir and local file mounts to cloud file mounts.
+    # ================================================================
+
+    def _sub_path_join(sub_path: Optional[str], path: str) -> str:
+        if sub_path is None:
+            return path
+        return os.path.join(sub_path, path).strip('/')
+
+    # Use a short run id so that the bucket/subdirectory name is unique
+    # across different jobs/services. Note that when the Python API is used,
+    # common_utils.get_usage_run_id() can repeat across multiple launch()
+    # calls in the same process, so a per-call uuid would be a safer choice.
+    run_id = common_utils.get_usage_run_id()[:4]
+    user_hash = common_utils.get_user_hash()
+    original_file_mounts = task.file_mounts if task.file_mounts else {}
+    original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
+
+    copy_mounts = task.get_local_to_remote_file_mounts()
+    if copy_mounts is None:
+        copy_mounts = {}
+
+    has_local_source_paths_file_mounts = bool(copy_mounts)
+    has_local_source_paths_workdir = task.workdir is not None
+
+    msg = None
+    if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
+        msg = 'workdir and file_mounts with local source paths'
+    elif has_local_source_paths_file_mounts:
+        msg = 'file_mounts with local source paths'
+    elif has_local_source_paths_workdir:
+        msg = 'workdir'
+    if msg:
+        logger.info(
+            ux_utils.starting_message(f'Translating {msg} to SkyPilot Storage...')
+        )
+        rich_utils.force_update_status(
+            ux_utils.spinner_message(f'Translating {msg} to SkyPilot Storage...')
+        )
+
+    # Get the bucket name for the workdir and file mounts; we store all these
+    # files in the same bucket, as specified in the config.
+    bucket_with_prefix = config.get_nested((task_type, 'bucket'), None)
+    store_kwargs: Dict[str, Any] = {}
+    if bucket_with_prefix is None:
+        store_type = sub_path = None
+        storage_account_name = region = None
+        bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
+            username=common_utils.get_cleaned_username(), user_hash=user_hash, id=run_id
+        )
+    else:
+        (store_type, bucket_name, sub_path, storage_account_name, region) = (
+            storage_lib.StoreType.get_fields_from_store_url(bucket_with_prefix)
+        )
+        if storage_account_name is not None:
+            store_kwargs['storage_account_name'] = storage_account_name
+        if region is not None:
+            store_kwargs['region'] = region
+
+    # Step 1: Translate the workdir to SkyPilot storage.
+    new_storage_mounts = {}
+    if task.workdir is not None:
+        workdir = task.workdir
+        task.workdir = None
+        if (
+            constants.KONDUKTOR_REMOTE_WORKDIR in original_file_mounts
+            or constants.KONDUKTOR_REMOTE_WORKDIR in original_storage_mounts
+        ):
+            raise ValueError(
+                f'Cannot mount {constants.KONDUKTOR_REMOTE_WORKDIR} as both '
+                'the workdir and a file_mounts target.'
+            )
+        bucket_sub_path = _sub_path_join(
+            sub_path,
+            constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(
+                task_name=task.name, run_id=run_id
+            ),
+        )
+        stores = None
+        if store_type is not None:
+            stores = [store_type]
+
+        storage_obj = storage_lib.Storage(
+            name=bucket_name,
+            source=workdir,
+            persistent=False,
+            mode=storage_lib.StorageMode.COPY,
+            stores=stores,
+            # Set `_is_sky_managed` to False when `bucket_with_prefix` is
+            # specified, so that the storage is not deleted when the job
+            # finishes; only the sub path is deleted.
+            # _is_sky_managed=bucket_with_prefix is None,
+            _is_sky_managed=False,
+            _bucket_sub_path=bucket_sub_path,
+        )
+        new_storage_mounts[constants.KONDUKTOR_REMOTE_WORKDIR] = storage_obj
+        # The existence of the workdir in file_mounts is checked during task
+        # construction.
+        logger.info(
+            f' {colorama.Style.DIM}Workdir: {workdir!r} '
+            f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+        )
+
+    # Step 2: Translate the local file mounts with folder in src to SkyPilot
+    # storage.
+    # TODO(zhwu): Optimize this by:
+    # 1. Use the same bucket for all the mounts.
+    # 2. When the src is the same, use the same bucket.
+    copy_mounts_with_file_in_src = {}
+    for i, (dst, src) in enumerate(copy_mounts.items()):
+        assert task.file_mounts is not None
+        task.file_mounts.pop(dst)
+        if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
+            copy_mounts_with_file_in_src[dst] = src
+            continue
+        bucket_sub_path = _sub_path_join(
+            sub_path,
+            constants.FILE_MOUNTS_SUBPATH.format(
+                task_name=task.name, i=i, run_id=run_id
+            ),
+        )
+        stores = None
+        if store_type is not None:
+            stores = [store_type]
+        storage_obj = storage_lib.Storage(
+            name=bucket_name,
+            source=src,
+            persistent=False,
+            mode=storage_lib.StorageMode.COPY,
+            stores=stores,
+            # _is_sky_managed=not bucket_with_prefix,
+            _is_sky_managed=False,
+            _bucket_sub_path=bucket_sub_path,
+        )
+        new_storage_mounts[dst] = storage_obj
+        logger.info(
+            f' {colorama.Style.DIM}Folder : {src!r} '
+            f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+        )
+
+    # Step 3: Translate local file mounts with file in src to SkyPilot storage.
+    # Hard link the files in src to a temporary directory, and upload the
+    # folder.
+    file_mounts_tmp_subpath = _sub_path_join(
+        sub_path,
+        constants.FILE_MOUNTS_TMP_SUBPATH.format(task_name=task.name, run_id=run_id),
+    )
+    base_tmp_dir = os.path.expanduser(constants.FILE_MOUNTS_LOCAL_TMP_BASE_PATH)
+    os.makedirs(base_tmp_dir, exist_ok=True)
+    with tempfile.TemporaryDirectory(dir=base_tmp_dir) as temp_path:
+        local_fm_path = os.path.join(
+            temp_path, constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)
+        )
+        os.makedirs(local_fm_path, exist_ok=True)
+        file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
+            task_type
+        )
+        if copy_mounts_with_file_in_src:
+            src_to_file_id = {}
+            for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
+                src_to_file_id[src] = i
+                os.link(
+                    os.path.abspath(os.path.expanduser(src)),
+                    os.path.join(local_fm_path, f'file-{i}'),
+                )
+            stores = None
+            if store_type is not None:
+                stores = [store_type]
+            storage_obj = storage_lib.Storage(
+                name=bucket_name,
+                source=local_fm_path,
+                persistent=False,
+                # mode=storage_lib.StorageMode.MOUNT
+                mode=storage_lib.StorageMode.COPY,
+                stores=stores,
+                # _is_sky_managed=not bucket_with_prefix,
+                _is_sky_managed=False,
+                _bucket_sub_path=file_mounts_tmp_subpath,
+            )
+
+            new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
+            if file_mount_remote_tmp_dir in original_storage_mounts:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Failed to translate file mounts, due to the default '
+                        f'destination {file_mount_remote_tmp_dir} '
+                        'being taken.'
+                    )
+            sources = list(src_to_file_id.keys())
+            sources_str = '\n '.join(sources)
+            logger.info(
+                f' {colorama.Style.DIM}Files (listed below) '
+                f' -> storage: {bucket_name}:'
+                f'\n {sources_str}{colorama.Style.RESET_ALL}'
+            )
+
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Uploading translated local files/folders')
+        )
+        task.update_storage_mounts(new_storage_mounts)
+
+        # Step 4: Upload storage from sources.
+        # Upload the local source to a bucket. The task will not be executed
+        # locally, so we need to upload the files/folders to the bucket
+        # manually here before sending the task to the remote jobs controller.
+        # This will also upload any storage mounts that are not translated.
+        # After sync_storage_mounts, we will also have file_mounts in the
+        # task, but these aren't used since the storage_mounts for the same
+        # paths take precedence.
+        if task.storage_mounts:
+            # There may be existing (non-translated) storage mounts, so log
+            # this whenever task.storage_mounts is non-empty.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message(
+                    'Uploading local sources to storage[/] '
+                    '[dim]View storages: sky storage ls'
+                )
+            )
+        try:
+            task.sync_storage_mounts()
+        except (ValueError, exceptions.NoCloudAccessError) as e:
+            if 'No enabled cloud for storage' in str(e) or isinstance(
+                e, exceptions.NoCloudAccessError
+            ):
+                data_src = None
+                if has_local_source_paths_file_mounts:
+                    data_src = 'file_mounts'
+                if has_local_source_paths_workdir:
+                    if data_src:
+                        data_src += ' and workdir'
+                    else:
+                        data_src = 'workdir'
+                store_enabled_clouds = ', '.join(
+                    storage_constants.STORE_ENABLED_CLOUDS
+                )
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.NotSupportedError(
+                        f'Unable to use {data_src} - no cloud with object '
+                        'store support is enabled. Please enable at least one '
+                        'cloud with object store support '
+                        f'({store_enabled_clouds}) by running `sky check`, or '
+                        f'remove {data_src} from your task.'
+                        '\nHint: If you do not have any cloud access, you may '
+                        'still download data and code over the network using '
+                        'curl or other tools in the `setup` section of the '
+                        'task.'
+                    ) from None
+
+        # Step 5: Add the file download into the file mounts, such as
+        # /original-dst: s3://spot-fm-file-only-bucket-name/file-0
+        new_file_mounts = {}
+        if copy_mounts_with_file_in_src:
+            # file_mount_remote_tmp_dir will only exist when there are files
+            # in the src for copy mounts.
+            storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
+            assert storage_obj.stores, (storage_obj.__dict__, task.to_yaml_config())
+            curr_store_type = list(storage_obj.stores.keys())[0]
+            store_object = storage_obj.stores[curr_store_type]
+            assert store_object is not None, (
+                storage_obj.__dict__,
+                task.to_yaml_config(),
+            )
+            bucket_url = storage_lib.StoreType.get_endpoint_url(
+                store_object, bucket_name
+            )
+            bucket_url += f'/{file_mounts_tmp_subpath}'
+            for dst, src in copy_mounts_with_file_in_src.items():
+                file_id = src_to_file_id[src]
+                new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
+        task.update_file_mounts(new_file_mounts)
+
+        # Step 6: Replace the source field that is a local path in all
+        # storage_mounts with the bucket URI, and remove the name field.
+        for storage_obj in task.storage_mounts.values():
+            if storage_obj.source is not None and not data_utils.is_cloud_store_url(
+                storage_obj.source
+            ):
+                # Need to replace the local path with the bucket URI, and
+                # remove the name field, so that the storage mount can work
+                # on the jobs controller.
+                store_types = list(storage_obj.stores.keys())
+                assert len(store_types) == 1, (
+                    'We only support one store type for now.',
+                    storage_obj.stores,
+                )
+                curr_store_type = store_types[0]
+                store_object = storage_obj.stores[curr_store_type]
+                assert store_object is not None and storage_obj.name is not None, (
+                    store_object,
+                    storage_obj.name,
+                )
+                storage_obj.source = storage_lib.StoreType.get_endpoint_url(
+                    store_object, storage_obj.name
+                )
+                storage_obj.force_delete = True
+
+        # Step 7: Convert all `MOUNT` mode storages which don't specify a
+        # source to specifying a source. If the source is specified with a
+        # local path, it was handled in step 6.
+        updated_mount_storages = {}
+        for storage_path, storage_obj in task.storage_mounts.items():
+            if (
+                storage_obj.mode == storage_lib.StorageMode.MOUNT
+                and not storage_obj.source
+            ):
+                # Construct the source URL with the first store type and the
+                # storage name, e.g., s3://my-storage-name
+                store_types = list(storage_obj.stores.keys())
+                assert len(store_types) == 1, (
+                    'We only support one store type for now.',
+                    storage_obj.stores,
+                )
+                curr_store_type = store_types[0]
+                store_object = storage_obj.stores[curr_store_type]
+                assert store_object is not None and storage_obj.name is not None, (
+                    store_object,
+                    storage_obj.name,
+                )
+                source = storage_lib.StoreType.get_endpoint_url(
+                    store_object, storage_obj.name
+                )
+                new_storage = storage_lib.Storage.from_yaml_config(
+                    {
+                        'source': source,
+                        'persistent': storage_obj.persistent,
+                        'mode': storage_lib.StorageMode.MOUNT.value,
+                        # We enable force delete to allow the controller to
+                        # delete the object store in case persistent is set
+                        # to False.
+                        '_force_delete': True,
+                    }
+                )
+                updated_mount_storages[storage_path] = new_storage
+        task.update_storage_mounts(updated_mount_storages)
+    if msg:
+        logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
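The new execution layer gives konduktor a SkyPilot-style two-phase launch: translate and upload local sources, then hand the task to JobsetBackend. A minimal usage sketch, following the docstring example above (the `run` argument and `konduktor.launch` come from that example; treating `workdir` as directly assignable is an assumption based on how the translation code reads and writes `task.workdir`):

    import konduktor

    # Local sources are translated to COPY-mode storage mounts (Steps 1-3
    # above) and uploaded before the job is submitted.
    task = konduktor.Task(run='python train.py')
    task.workdir = '.'
    # launch() runs maybe_translate_local_file_mounts_and_sync_up(task, 'job')
    # and then submits the job through JobsetBackend.
    job_name = konduktor.launch(task, detach_run=True)
    print(f'Submitted job: {job_name}')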
konduktor/kube_client.py CHANGED
@@ -1,62 +1,158 @@
+import logging
 import os
+from typing import Any, Callable, Optional
 
 import kubernetes
 import urllib3
 
 from konduktor import logging as konduktor_logging
+from konduktor.utils import annotations, ux_utils
 
 logger = konduktor_logging.get_logger(__name__)
 
 # Timeout to use for API calls
 API_TIMEOUT = 5
-
-_configured = False
-_core_api = None
-
-# For dashboard
-_batch_api = None
-_crd_api = None
-
-
-def _load_config():
-    global _configured
-    if _configured:
-        return
-    try:
-        os.environ["KUBERNETES_SERVICE_HOST"] = "kubernetes.default.svc"
-        os.environ["KUBERNETES_SERVICE_PORT"] = "443"
-        kubernetes.config.load_incluster_config()
-        logger.info("incluster k8s config loaded")
-    except kubernetes.config.config_exception.ConfigException:
-        # this should really only be loaded for debugging.
-        logger.warning("incluster config failed to load, attempting to use kubeconfig.")
-        kubernetes.config.load_kube_config()
-        logger.info("KUBECONFIG loaded")
-    _configured = True
-
-
-def core_api():
-    global _core_api
-    if _core_api is None:
-        _load_config()
-        _core_api = kubernetes.client.CoreV1Api()
-    return _core_api
-
-
-def batch_api():
-    global _batch_api
-    if _batch_api is None:
-        _load_config()
-        _batch_api = kubernetes.client.BatchV1Api()
-    return _batch_api
-
-
-def crd_api():
-    global _crd_api
-    if _crd_api is None:
-        _load_config()
-        _crd_api = kubernetes.client.CustomObjectsApi()
-    return _crd_api
+DEFAULT_NAMESPACE = 'default'
+DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
+# The name of the environment variable that stores the in-cluster context name
+# for Kubernetes clusters. This is used to associate a name with the current
+# context when running with in-cluster auth. If not set, the context name is
+# set to DEFAULT_IN_CLUSTER_REGION.
+IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
+
+
+def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
+    for attr_name in dir(obj):
+        attr = getattr(obj, attr_name)
+        # Skip methods starting with '__' since they are invoked through one
+        # of the main methods, which are already decorated.
+        if callable(attr) and not attr_name.startswith('__'):
+            setattr(obj, attr_name, decorator(attr))
+    return obj
+
+
+def _api_logging_decorator(logger: str, level: int):
+    """Decorator to set the logging level for API calls.
+
+    This is used to suppress the verbose logging from urllib3 when calls to
+    the Kubernetes API time out.
+    """
+
+    def decorated_api(api):
+        def wrapped(*args, **kwargs):
+            obj = api(*args, **kwargs)
+            _decorate_methods(
+                obj, konduktor_logging.set_logging_level(logger, level), 'api_log'
+            )
+            return obj
+
+        return wrapped
+
+    return decorated_api
+
+
+def _load_config(context: Optional[str] = None):
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    def _load_config_from_kubeconfig(context: Optional[str] = None):
+        try:
+            kubernetes.config.load_kube_config(context=context)
+        except kubernetes.config.config_exception.ConfigException as e:
+            # Check if the exception was due to a missing current-context.
+            if 'Expected key current-context' in str(e):
+                err_str = (
+                    f'Failed to load Kubernetes configuration for {context!r}. '
+                    'Kubeconfig does not contain any valid context(s).\n'
+                    ' If you were running a local Kubernetes '
+                    'cluster, run `sky local up` to start the cluster.'
+                )
+            else:
+                err_str = (
+                    f'Failed to load Kubernetes configuration for {context!r}. '
+                    'Please check if your kubeconfig file exists at '
+                    '~/.kube/config and is valid.'
+                )
+            err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(err_str) from None
+
+    if context == in_cluster_context_name() or context is None:
+        try:
+            # Load the in-cluster config if running in a pod and context is
+            # None. Environment variables that Kubernetes sets for service
+            # discovery do not show up in SkyPilot tasks, so for now we work
+            # around this by using the DNS name instead of those variables.
+            # See issue: https://github.com/skypilot-org/skypilot/issues/2287
+            os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+            os.environ['KUBERNETES_SERVICE_PORT'] = '443'
+            kubernetes.config.load_incluster_config()
+        except kubernetes.config.config_exception.ConfigException:
+            _load_config_from_kubeconfig()
+    else:
+        _load_config_from_kubeconfig(context)
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def core_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CoreV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def auth_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.RbacAuthorizationV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def networking_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.NetworkingV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def crd_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CustomObjectsApi()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def node_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.NodeV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def apps_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.AppsV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def api_client(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.ApiClient()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def batch_api(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.BatchV1Api()
+
+
+@_api_logging_decorator('urllib3', logging.ERROR)
+@annotations.lru_cache(scope='request')
+def crd_client(context: Optional[str] = None):
+    _load_config(context)
+    return kubernetes.client.CustomObjectsApi()
 
 
 def api_exception():
@@ -73,3 +169,12 @@ def max_retry_error():
 
 def stream():
     return kubernetes.stream.stream
+
+
+def in_cluster_context_name() -> Optional[str]:
+    """Returns the name of the in-cluster context from the environment.
+
+    If the environment variable is not set, returns the default in-cluster
+    context name.
+    """
+    return os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR) or DEFAULT_IN_CLUSTER_REGION
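The refactored kube_client replaces the hand-rolled singletons with per-context, cache-decorated client getters. A minimal sketch of how they might be called (list_namespaced_pod and _request_timeout are standard kubernetes Python client API; caching behavior is inferred from the annotations.lru_cache decorator above):

    from konduktor import kube_client

    # The first call loads the in-cluster config or kubeconfig; subsequent
    # calls for the same context return the cached CoreV1Api client.
    core = kube_client.core_api()
    pods = core.list_namespaced_pod(
        kube_client.DEFAULT_NAMESPACE,
        _request_timeout=kube_client.API_TIMEOUT,
    )
    for pod in pods.items:
        print(pod.metadata.name)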