konduktor-nightly 0.1.0.dev20251128104812 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/execution.py ADDED
@@ -0,0 +1,447 @@
+ """Execution layer."""
+
+ import os
+ import tempfile
+ import typing
+ from typing import Any, Dict, Optional
+
+ import colorama
+
+ if typing.TYPE_CHECKING:
+     import konduktor
+
+ from konduktor import config, constants
+ from konduktor import logging as konduktor_logging
+ from konduktor.backends import DeploymentBackend, JobsetBackend
+ from konduktor.data import data_utils
+ from konduktor.data import registry as storage_registry
+ from konduktor.data import storage as storage_lib
+ from konduktor.utils import common_utils, exceptions, rich_utils, ux_utils
+
+ logger = konduktor_logging.get_logger(__name__)
+
+
+ def _execute(
+     task: 'konduktor.Task',
+     dryrun: bool = False,
+     detach_run: bool = False,
+ ) -> Optional[str]:
+     """Execute a task.
+
+     Args:
+         task: konduktor.Task to execute.
+         dryrun: bool; if True, only template the workload and print the
+             provision info without submitting it.
+         detach_run: bool; if True, return once the job is submitted and do
+             not stream the output.
+
+     Returns:
+         job_name: Optional[str]; the name of the submitted job, or None on
+             a dryrun.
+     """
+     # (asaiacai): in the future we may support more backends, but it's
+     # not likely.
+     if task.serving:
+         backend = DeploymentBackend()  # type: ignore
+     else:
+         backend = JobsetBackend()  # type: ignore
+     # Template the commands for syncing the file mounts into the shell
+     # commands run during pod initialization.
+     job_name = backend.execute(task, detach_run, dryrun=dryrun)
+
+     if dryrun:
+         logger.info('Dryrun finished.')
+         return None
+
+     # Attach to the head node's output if detach_run is False.
+     backend.post_execute()
+
+     return job_name
+
+
+ def launch(
+     task: 'konduktor.Task',
+     dryrun: bool = False,
+     detach_run: bool = False,
+ ) -> Optional[str]:
+     """Launch a task.
+
+     Args:
+         task: konduktor.Task to launch.
+         dryrun: if True, do not actually launch the task.
+         detach_run: if True, return from this function as soon as the job
+             is submitted, without streaming the execution logs.
+
+     Example:
+         .. code-block:: python
+
+             import konduktor
+             task = konduktor.Task(run='echo hello konduktor')
+             konduktor.launch(task)
+
+     Raises:
+         Other exceptions may be raised depending on the backend.
+
+     Returns:
+         job_name: Optional[str]; the name of the submitted job.
+     """
+
+     maybe_translate_local_file_mounts_and_sync_up(task, 'job')
+
+     return _execute(
+         task=task,
+         dryrun=dryrun,
+         detach_run=detach_run,
+     )
+
+
+ # (maybe translate local file mounts) and (sync up)
+ def maybe_translate_local_file_mounts_and_sync_up(
+     task: 'konduktor.Task', task_type: str
+ ) -> None:
+     """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
+
+     Eagerly syncing up local->Storage ensures Storage->VM would work at task
+     launch time.
+
+     If there are no local source paths to be translated, this function still
+     syncs up any storage mounts with local source paths (which do not
+     undergo translation).
+
+     When jobs.bucket or serve.bucket is not specified, an intermediate
+     storage dedicated to the job is created for the workdir and local file
+     mounts, and the storage is deleted when the job finishes. We don't share
+     the storage between jobs, because jobs might have different resource
+     requirements, and sharing storage between jobs may cause egress costs or
+     slower transfer speeds.
+     """
+
+     # ================================================================
+     # Translate the workdir and local file mounts to cloud file mounts.
+     # ================================================================
+
+     def _sub_path_join(sub_path: Optional[str], path: str) -> str:
+         if sub_path is None:
+             return path
+         return os.path.join(sub_path, path).strip('/')
+
+     # Use a short run id so that the bucket/subdirectory name is unique
+     # across different jobs/services. Note that when the Python API is used,
+     # common_utils.get_usage_run_id() returns the same id across multiple
+     # launch calls within one process.
+     run_id = common_utils.get_usage_run_id()[:4]
+     user_hash = common_utils.get_user_hash()
+     original_file_mounts = task.file_mounts if task.file_mounts else {}
+     original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
+
+     copy_mounts = task.get_local_to_remote_file_mounts()
+     if copy_mounts is None:
+         copy_mounts = {}
+
+     has_local_source_paths_file_mounts = bool(copy_mounts)
+     has_local_source_paths_workdir = task.workdir is not None
+
+     msg = None
+     if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
+         msg = 'workdir and file_mounts with local source paths'
+     elif has_local_source_paths_file_mounts:
+         msg = 'file_mounts with local source paths'
+     elif has_local_source_paths_workdir:
+         msg = 'workdir'
+     if msg:
+         logger.info(
+             ux_utils.starting_message(f'Translating {msg} to cloud Storage...')
+         )
+         rich_utils.force_update_status(
+             ux_utils.spinner_message(f'Translating {msg} to cloud Storage...')
+         )
+
+     # Get the bucket name for the workdir and file mounts; we store all
+     # these files in the same bucket from the config.
+     bucket_wth_prefix = config.get_nested((task_type, 'bucket'), None)
+     store_kwargs: Dict[str, Any] = {}
+     if bucket_wth_prefix is None:
+         store_type = sub_path = None
+         storage_account_name = region = None
+         bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
+             username=common_utils.get_cleaned_username(), user_hash=user_hash, id=run_id
+         )
+     else:
+         (store_type, bucket_name, sub_path, storage_account_name, region) = (
+             storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix)
+         )
+         if storage_account_name is not None:
+             store_kwargs['storage_account_name'] = storage_account_name
+         if region is not None:
+             store_kwargs['region'] = region
+
+     # Step 1: Translate the workdir to a cloud storage mount.
+     new_storage_mounts = {}
+     if task.workdir is not None:
+         workdir = task.workdir
+         task.workdir = None
+         if (
+             constants.KONDUKTOR_REMOTE_WORKDIR in original_file_mounts
+             or constants.KONDUKTOR_REMOTE_WORKDIR in original_storage_mounts
+         ):
+             raise ValueError(
+                 f'Cannot mount {constants.KONDUKTOR_REMOTE_WORKDIR} as both '
+                 'the workdir and file_mounts contain it as the target.'
+             )
+         bucket_sub_path = _sub_path_join(
+             sub_path,
+             constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(
+                 task_name=task.name, run_id=run_id
+             ),
+         )
+         stores = None
+         if store_type is not None:
+             stores = [store_type]
+
+         storage_obj = storage_lib.Storage(
+             name=bucket_name,
+             source=workdir,
+             persistent=False,
+             mode=storage_lib.StorageMode.COPY,
+             stores=stores,
+             # Set `_is_sky_managed` to False when `bucket_wth_prefix` is
+             # specified, so that the storage is not deleted when the job
+             # finishes; only the sub path is deleted.
+             # _is_sky_managed=bucket_wth_prefix is None,
+             _is_sky_managed=False,
+             _bucket_sub_path=bucket_sub_path,
+         )
+         new_storage_mounts[constants.KONDUKTOR_REMOTE_WORKDIR] = storage_obj
+         # The check for the existence of the workdir in file_mounts is done
+         # during task construction.
+         logger.info(
+             f'  {colorama.Style.DIM}Workdir: {workdir!r} '
+             f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+         )
+
+     # Step 2: Translate the local file mounts with a folder in src to cloud
+     # storage.
+     # TODO(zhwu): Optimize this by:
+     # 1. Using the same bucket for all the mounts.
+     # 2. Using the same bucket when the src is the same.
+     copy_mounts_with_file_in_src = {}
+     for i, (dst, src) in enumerate(copy_mounts.items()):
+         assert task.file_mounts is not None
+         task.file_mounts.pop(dst)
+         if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
+             copy_mounts_with_file_in_src[dst] = src
+             continue
+         bucket_sub_path = _sub_path_join(
+             sub_path,
+             constants.FILE_MOUNTS_SUBPATH.format(
+                 task_name=task.name, i=i, run_id=run_id
+             ),
+         )
+         stores = None
+         if store_type is not None:
+             stores = [store_type]
+         storage_obj = storage_lib.Storage(
+             name=bucket_name,
+             source=src,
+             persistent=False,
+             mode=storage_lib.StorageMode.COPY,
+             stores=stores,
+             # _is_sky_managed=not bucket_wth_prefix,
+             _is_sky_managed=False,
+             _bucket_sub_path=bucket_sub_path,
+         )
+         new_storage_mounts[dst] = storage_obj
+         logger.info(
+             f'  {colorama.Style.DIM}Folder : {src!r} '
+             f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}'
+         )
+
+     # Step 3: Translate local file mounts with a file in src to cloud
+     # storage. Hard link the files in src to a temporary directory, and
+     # upload that folder.
+     file_mounts_tmp_subpath = _sub_path_join(
+         sub_path,
+         constants.FILE_MOUNTS_TMP_SUBPATH.format(task_name=task.name, run_id=run_id),
+     )
+     base_tmp_dir = os.path.expanduser(constants.FILE_MOUNTS_LOCAL_TMP_BASE_PATH)
+     os.makedirs(base_tmp_dir, exist_ok=True)
+     with tempfile.TemporaryDirectory(dir=base_tmp_dir) as temp_path:
+         local_fm_path = os.path.join(
+             temp_path, constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)
+         )
+         os.makedirs(local_fm_path, exist_ok=True)
+         file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
+             task_type
+         )
+         if copy_mounts_with_file_in_src:
+             src_to_file_id = {}
+             for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
+                 src_to_file_id[src] = i
+                 os.link(
+                     os.path.abspath(os.path.expanduser(src)),
+                     os.path.join(local_fm_path, f'file-{i}'),
+                 )
+             stores = None
+             if store_type is not None:
+                 stores = [store_type]
+             storage_obj = storage_lib.Storage(
+                 name=bucket_name,
+                 source=local_fm_path,
+                 persistent=False,
+                 # mode=storage_lib.StorageMode.MOUNT
+                 mode=storage_lib.StorageMode.COPY,
+                 stores=stores,
+                 # _is_sky_managed=not bucket_wth_prefix,
+                 _is_sky_managed=False,
+                 _bucket_sub_path=file_mounts_tmp_subpath,
+             )
+
+             new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
+             if file_mount_remote_tmp_dir in original_storage_mounts:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError(
+                         'Failed to translate file mounts, due to the default '
+                         f'destination {file_mount_remote_tmp_dir} '
+                         'being taken.'
+                     )
+             sources = list(src_to_file_id.keys())
+             sources_str = '\n  '.join(sources)
+             logger.info(
+                 f'  {colorama.Style.DIM}Files (listed below) '
+                 f' -> storage: {bucket_name}:'
+                 f'\n  {sources_str}{colorama.Style.RESET_ALL}'
+             )
+
+         rich_utils.force_update_status(
+             ux_utils.spinner_message('Uploading translated local files/folders')
+         )
+         task.update_storage_mounts(new_storage_mounts)
+
+         # Step 4: Upload storage from sources.
+         # Upload the local sources to a bucket. The task will not be
+         # executed locally, so we need to upload the files/folders to the
+         # bucket manually here before sending the task to the remote jobs
+         # controller. This also uploads any storage mounts that are not
+         # translated. After sync_storage_mounts, we will also have
+         # file_mounts in the task, but these aren't used since the
+         # storage_mounts for the same paths take precedence.
+         if task.storage_mounts:
+             # There may be existing (non-translated) storage mounts, so log
+             # this whenever task.storage_mounts is non-empty.
+             rich_utils.force_update_status(
+                 ux_utils.spinner_message(
+                     'Uploading local sources to storage[/] '
+                     '[dim]View storages: sky storage ls'
+                 )
+             )
+         try:
+             task.sync_storage_mounts()
+         except (ValueError, exceptions.NoCloudAccessError) as e:
+             if 'No enabled cloud for storage' in str(e) or isinstance(
+                 e, exceptions.NoCloudAccessError
+             ):
+                 data_src = None
+                 if has_local_source_paths_file_mounts:
+                     data_src = 'file_mounts'
+                 if has_local_source_paths_workdir:
+                     if data_src:
+                         data_src += ' and workdir'
+                     else:
+                         data_src = 'workdir'
+                 store_enabled_clouds = ', '.join(storage_registry._STORE_ENABLED_CLOUDS)
+                 with ux_utils.print_exception_no_traceback():
+                     raise exceptions.NotSupportedError(
+                         f'Unable to use {data_src} - no cloud with object '
+                         'store support is enabled. Please enable at least one '
+                         'cloud with object store support '
+                         f'({store_enabled_clouds}) by running `sky check`, or '
+                         f'remove {data_src} from your task.'
+                         '\nHint: If you do not have any cloud access, you may '
+                         'still download data and code over the network using '
+                         'curl or other tools in the `setup` section of the '
+                         'task.'
+                     ) from None
+
+         # Step 5: Add the file downloads to the file mounts, such as
+         #   /original-dst: s3://spot-fm-file-only-bucket-name/file-0
+         new_file_mounts = {}
+         if copy_mounts_with_file_in_src:
+             # file_mount_remote_tmp_dir will only exist when there are files
+             # in the src for copy mounts.
+             storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
+             assert storage_obj.stores, (storage_obj.__dict__, task.to_yaml_config())
+             curr_store_type = list(storage_obj.stores.keys())[0]
+             store_object = storage_obj.stores[curr_store_type]
+             assert store_object is not None, (
+                 storage_obj.__dict__,
+                 task.to_yaml_config(),
+             )
+             bucket_url = storage_lib.StoreType.get_endpoint_url(
+                 store_object, bucket_name
+             )
+             bucket_url += f'/{file_mounts_tmp_subpath}'
+             for dst, src in copy_mounts_with_file_in_src.items():
+                 file_id = src_to_file_id[src]
+                 new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
+             task.update_file_mounts(new_file_mounts)
+
+         # Step 6: Replace the source field that is a local path in all
+         # storage_mounts with the bucket URI, and remove the name field.
+         for storage_obj in task.storage_mounts.values():
+             if storage_obj.source is not None and not data_utils.is_cloud_store_url(
+                 storage_obj.source
+             ):
+                 # Replace the local path with the bucket URI, and remove the
+                 # name field, so that the storage mount can work on the jobs
+                 # controller.
+                 store_types = list(storage_obj.stores.keys())
+                 assert len(store_types) == 1, (
+                     'We only support one store type for now.',
+                     storage_obj.stores,
+                 )
+                 curr_store_type = store_types[0]
+                 store_object = storage_obj.stores[curr_store_type]
+                 assert store_object is not None and storage_obj.name is not None, (
+                     store_object,
+                     storage_obj.name,
+                 )
+                 storage_obj.source = storage_lib.StoreType.get_endpoint_url(
+                     store_object, storage_obj.name
+                 )
+                 storage_obj.force_delete = True
+
+         # Step 7: Convert all `MOUNT` mode storages which don't specify a
+         # source to specifying a source. If the source is specified with a
+         # local path, it was handled in Step 6.
+         updated_mount_storages = {}
+         for storage_path, storage_obj in task.storage_mounts.items():
+             if (
+                 storage_obj.mode == storage_lib.StorageMode.MOUNT
+                 and not storage_obj.source
+             ):
+                 # Construct the source URL with the first store type and the
+                 # storage name, e.g., s3://my-storage-name.
+                 store_types = list(storage_obj.stores.keys())
+                 assert len(store_types) == 1, (
+                     'We only support one store type for now.',
+                     storage_obj.stores,
+                 )
+                 curr_store_type = store_types[0]
+                 store_object = storage_obj.stores[curr_store_type]
+                 assert store_object is not None and storage_obj.name is not None, (
+                     store_object,
+                     storage_obj.name,
+                 )
+                 source = storage_lib.StoreType.get_endpoint_url(
+                     store_object, storage_obj.name
+                 )
+                 new_storage = storage_lib.Storage.from_yaml_config(
+                     {
+                         'source': source,
+                         'persistent': storage_obj.persistent,
+                         'mode': storage_lib.StorageMode.MOUNT.value,
+                         # Enable force delete so the controller can delete
+                         # the object store in case persistent is False.
+                         '_force_delete': True,
+                     }
+                 )
+                 updated_mount_storages[storage_path] = new_storage
+         task.update_storage_mounts(updated_mount_storages)
+     if msg:
+         logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
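
For orientation, the translation path above is what a launch with local sources exercises end to end. A minimal sketch, assuming `Task.workdir` and `Task.file_mounts` are settable attributes as they are used in this module (the paths and mount targets are illustrative, not konduktor's documented API):

```python
import konduktor

# Hypothetical task with local sources. launch() first runs
# maybe_translate_local_file_mounts_and_sync_up(task, 'job'), which uploads
# these sources to a bucket and rewrites them as storage mounts, then hands
# the task to JobsetBackend (or DeploymentBackend when task.serving is set).
task = konduktor.Task(run='python train.py')
task.workdir = '~/my-project'              # folder -> bucket (Step 1)
task.file_mounts = {
    '/data': '~/datasets',                 # folder -> bucket (Step 2)
    '/cfg.yaml': '~/cfg.yaml',             # file -> bucket/.../file-0 (Steps 3, 5)
}
job_name = konduktor.launch(task, detach_run=True)
```

When `jobs.bucket` is set in the config, the code above instead decomposes the configured URL via `storage_lib.StoreType.get_fields_from_store_url(...)` into `(store_type, bucket_name, sub_path, storage_account_name, region)` and reuses that bucket, deleting only the job's sub path afterwards.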
konduktor/kube_client.py ADDED
@@ -0,0 +1,237 @@
+ import logging
+ import os
+ from typing import Any, Callable, List, Optional
+
+ import kubernetes
+ import urllib3
+
+ from konduktor import config
+ from konduktor import logging as konduktor_logging
+ from konduktor.utils import annotations, ux_utils
+
+ logger = konduktor_logging.get_logger(__name__)
+
+ # Timeout to use for API calls
+ API_TIMEOUT = 5
+ DEFAULT_NAMESPACE = 'default'
+ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
+ # The name of the environment variable that stores the in-cluster context
+ # name for Kubernetes clusters. This is used to associate a name with the
+ # current context when running with in-cluster auth. If not set, the context
+ # name is set to DEFAULT_IN_CLUSTER_REGION.
+ IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'KONDUKTOR_IN_CLUSTER_CONTEXT_NAME'
+
+ # Tracks the most recently selected/loaded context name.
+ # None means no explicit context has been resolved/loaded yet.
+ _ACTIVE_CONTEXT: Optional[str] = None
+
+
+ def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
+     for attr_name in dir(obj):
+         attr = getattr(obj, attr_name)
+         # Decorate public methods only; methods starting with '__' are
+         # invoked through one of the main methods, which are already
+         # decorated.
+         if callable(attr) and not attr_name.startswith('__'):
+             setattr(obj, attr_name, decorator(attr))
+     return obj
+
+
+ def _api_logging_decorator(logger: str, level: int):
+     """Decorator to set the logging level for API calls.
+
+     This is used to suppress verbose logging from urllib3 when calls to the
+     Kubernetes API time out.
+     """
+
+     def decorated_api(api):
+         def wrapped(*args, **kwargs):
+             obj = api(*args, **kwargs)
+             _decorate_methods(
+                 obj, konduktor_logging.set_logging_level(logger, level), 'api_log'
+             )
+             return obj
+
+         return wrapped
+
+     return decorated_api
+
+
+ def _load_config(context: Optional[str] = None):
+     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+     # If no context is explicitly provided, prefer the first configured
+     # allowed context when present. This ensures the client defaults to the
+     # user-specified context instead of kubeconfig's current-context.
+     effective_context = context
+     allowed_contexts: List[str] = config.get_nested(
+         ('kubernetes', 'allowed_contexts'), []
+     )
+
+     is_allowed_selected = False
+     if effective_context is None and allowed_contexts:
+         effective_context = allowed_contexts[0]
+         is_allowed_selected = True
+         logger.info(
+             'Detected kubernetes.allowed_contexts in config; using context: %s',
+             effective_context,
+         )
+
+     def _load_config_from_kubeconfig(context: Optional[str] = None):
+         try:
+             kubernetes.config.load_kube_config(context=context)
+         except kubernetes.config.config_exception.ConfigException as e:
+             # Improve the error when a configured allowed context cannot be
+             # loaded.
+             msg = str(e)
+             if is_allowed_selected and context is not None:
+                 err_str = (
+                     'Configured Kubernetes context not usable: '
+                     f'kubernetes.allowed_contexts[0] = {context!r}. '
+                     'Please ensure this context exists and is valid in your '
+                     'kubeconfig (typically at ~/.kube/config).'
+                 )
+             elif 'Expected key current-context' in msg:
+                 err_str = (
+                     f'Failed to load Kubernetes configuration for {context!r}. '
+                     'Kubeconfig does not contain any valid context(s).\n'
+                 )
+             else:
+                 err_str = (
+                     f'Failed to load Kubernetes configuration for {context!r}. '
+                     'Please check if your kubeconfig file exists at '
+                     '~/.kube/config and is valid.'
+                 )
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(err_str) from None
+
+     global _ACTIVE_CONTEXT
+     if effective_context == in_cluster_context_name() or effective_context is None:
+         try:
+             # Load the in-cluster config if running in a pod and no context
+             # was resolved. Kubernetes-set environment variables for service
+             # discovery do not show up in tasks, so for now we work around
+             # this by using the DNS name instead of environment variables.
+             # See issue: https://github.com/skypilot-org/skypilot/issues/2287
+             os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+             os.environ['KUBERNETES_SERVICE_PORT'] = '443'
+             kubernetes.config.load_incluster_config()
+             _ACTIVE_CONTEXT = in_cluster_context_name()
+         except kubernetes.config.config_exception.ConfigException:
+             # If allowed_contexts was specified, do not fall back silently.
+             if is_allowed_selected:
+                 _load_config_from_kubeconfig(effective_context)
+                 _ACTIVE_CONTEXT = effective_context
+             else:
+                 _load_config_from_kubeconfig()
+                 # Best effort: set the active context to the current-context
+                 # from kubeconfig.
+                 try:
+                     _, current_ctx = kubernetes.config.list_kube_config_contexts()
+                     _ACTIVE_CONTEXT = current_ctx.get('name') if current_ctx else None
+                 except kubernetes.config.config_exception.ConfigException:
+                     _ACTIVE_CONTEXT = None
+     else:
+         _load_config_from_kubeconfig(effective_context)
+         _ACTIVE_CONTEXT = effective_context
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def core_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.CoreV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def auth_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.RbacAuthorizationV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def networking_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.NetworkingV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def crd_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.CustomObjectsApi()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def node_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.NodeV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def apps_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.AppsV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def api_client(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.ApiClient()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def batch_api(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.BatchV1Api()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def crd_client(context: Optional[str] = None):
+     _load_config(context)
+     return kubernetes.client.CustomObjectsApi()
+
+
+ @_api_logging_decorator('urllib3', logging.ERROR)
+ @annotations.lru_cache(scope='request')
+ def autoscaling_api(context: Optional[str] = None):
+     """Return the Kubernetes AutoscalingV2Api client."""
+     _load_config(context)
+     return kubernetes.client.AutoscalingV2Api()
+
+
+ def api_exception():
+     return kubernetes.client.rest.ApiException
+
+
+ def config_exception():
+     return kubernetes.config.config_exception.ConfigException
+
+
+ def max_retry_error():
+     return urllib3.exceptions.MaxRetryError
+
+
+ def stream():
+     return kubernetes.stream.stream
+
+
+ def in_cluster_context_name() -> str:
+     """Returns the name of the in-cluster context from the environment.
+
+     If the environment variable is not set, returns the default in-cluster
+     context name.
+     """
+     return os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR) or DEFAULT_IN_CLUSTER_REGION
+
+
+ def get_active_context() -> Optional[str]:
+     """Returns the last context selected by the client loader.
+
+     This reflects the effective context used by the most recent client
+     initialization; it may be None if no client has been initialized yet.
+     """
+     return _ACTIVE_CONTEXT
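
As a closing usage sketch for these helpers (assuming a reachable cluster via kubeconfig or in-cluster credentials; the namespace and printed fields are illustrative):

```python
from konduktor import kube_client

# Context resolution order, per _load_config above: explicit argument,
# then kubernetes.allowed_contexts[0] from the konduktor config, then the
# in-cluster config, then kubeconfig's current-context. Clients are cached
# per request scope by annotations.lru_cache.
core = kube_client.core_api()
print('active context:', kube_client.get_active_context())

# Bound the call with the module's API_TIMEOUT so an unreachable cluster
# fails fast instead of hanging.
pods = core.list_namespaced_pod(
    namespace=kube_client.DEFAULT_NAMESPACE,
    _request_timeout=kube_client.API_TIMEOUT,
)
for pod in pods.items:
    print(pod.metadata.name, pod.status.phase)
```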