skypilot-nightly 1.0.0.dev20251001__py3-none-any.whl → 1.0.0.dev20251003__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -109
- sky/client/cli/command.py +2 -3
- sky/client/cli/table_utils.py +222 -1
- sky/clouds/cudo.py +1 -1
- sky/clouds/kubernetes.py +7 -19
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-4f0c389a4ce5fd9c.js → webpack-3286453d56f3c0a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +9 -0
- sky/global_user_state.py +16 -0
- sky/jobs/server/core.py +60 -53
- sky/jobs/state.py +21 -1
- sky/jobs/utils.py +29 -11
- sky/provision/kubernetes/config.py +0 -42
- sky/provision/kubernetes/instance.py +1 -33
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network_utils.py +0 -21
- sky/provision/kubernetes/utils.py +68 -322
- sky/schemas/api/responses.py +21 -0
- sky/server/requests/serializers/decoders.py +8 -0
- sky/server/requests/serializers/encoders.py +6 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -13
- sky/utils/env_options.py +4 -0
- sky/utils/kubernetes_enums.py +2 -15
- sky/utils/schemas.py +17 -6
- sky/volumes/client/sdk.py +3 -2
- sky/volumes/server/core.py +3 -2
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/RECORD +53 -56
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- sky/volumes/utils.py +0 -224
- /sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces.html
CHANGED

@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-3286453d56f3c0a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/Haazh5IQz6F8Wyiqxcaj8/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Haazh5IQz6F8Wyiqxcaj8/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Haazh5IQz6F8Wyiqxcaj8","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/data/storage_utils.py
CHANGED
@@ -251,6 +251,15 @@ def zip_files_and_folders(items: List[str],
             archive_name = _get_archive_name(item, item)
             zipf.write(item, archive_name)
         elif os.path.isdir(item):
+            # Include root dir
+            archive_name = _get_archive_name(item, item)
+            # If it's a symlink, store it as a symlink
+            if os.path.islink(item):
+                _store_symlink(zipf, item, archive_name, is_dir=True)
+            else:
+                zipf.write(item, archive_name)
+
+            # Include dir contents recursively
             excluded_files = set([
                 os.path.join(item, f.rstrip('/'))
                 for f in get_excluded_files(item)
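The new branch stores a directory that is itself a symlink as a link entry instead of recursing into its target. The `_store_symlink` helper is not shown in this diff; a minimal, self-contained sketch of how such a helper can be written with only the standard library (the names below are illustrative, not the actual SkyPilot implementation) is:

import os
import stat
import zipfile

def store_symlink_sketch(zipf: zipfile.ZipFile, link_path: str,
                         archive_name: str, is_dir: bool = False) -> None:
    """Record link_path in the archive as a symlink entry, not its target."""
    info = zipfile.ZipInfo(archive_name + ('/' if is_dir else ''))
    # The upper 16 bits of external_attr carry the POSIX mode; mark the
    # entry as a symlink so extractors that honor it can recreate the link.
    info.external_attr = (stat.S_IFLNK | 0o777) << 16
    # A symlink entry's payload is simply the link target string.
    zipf.writestr(info, os.readlink(link_path))

if __name__ == '__main__':
    os.makedirs('/tmp/zip_symlink_demo/real_dir', exist_ok=True)
    link = '/tmp/zip_symlink_demo/link_dir'
    if not os.path.lexists(link):
        os.symlink('real_dir', link)
    with zipfile.ZipFile('/tmp/zip_symlink_demo/out.zip', 'w') as zipf:
        store_symlink_sketch(zipf, link, 'link_dir', is_dir=True)
    print(zipfile.ZipFile('/tmp/zip_symlink_demo/out.zip').namelist())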
sky/global_user_state.py
CHANGED
@@ -1065,6 +1065,22 @@ def get_handle_from_cluster_name(
        return pickle.loads(row.handle)


+@_init_db
+@metrics_lib.time_me
+def get_handles_from_cluster_names(
+    cluster_names: Set[str]
+) -> Dict[str, Optional['backends.ResourceHandle']]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table.c.name,
+                             cluster_table.c.handle).filter(
+                                 cluster_table.c.name.in_(cluster_names)).all()
+        return {
+            row.name: pickle.loads(row.handle) if row is not None else None
+            for row in rows
+        }
+
+
 @_init_db_async
 @metrics_lib.time_me
 async def get_status_from_cluster_name_async(
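The new get_handles_from_cluster_names replaces repeated per-cluster get_handle_from_cluster_name calls with a single query over all requested names. A self-contained sketch of that batched-lookup pattern is below; the table definition and data are stand-ins for illustration, not the real SkyPilot schema:

import pickle
from typing import Dict, Optional, Set

import sqlalchemy
from sqlalchemy import orm

# Stand-in schema; the real cluster table lives in sky/global_user_state.py.
metadata = sqlalchemy.MetaData()
cluster_table = sqlalchemy.Table(
    'clusters', metadata,
    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('handle', sqlalchemy.LargeBinary))
engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

def get_handles(cluster_names: Set[str]) -> Dict[str, Optional[object]]:
    """One IN (...) query instead of one round trip per cluster."""
    with orm.Session(engine) as session:
        rows = session.query(cluster_table.c.name,
                             cluster_table.c.handle).filter(
                                 cluster_table.c.name.in_(cluster_names)).all()
        return {row.name: pickle.loads(row.handle) for row in rows}

with orm.Session(engine) as session:
    session.execute(cluster_table.insert(), [
        {'name': 'sky-1-job-1', 'handle': pickle.dumps({'ip': '10.0.0.1'})},
        {'name': 'sky-1-job-2', 'handle': pickle.dumps({'ip': '10.0.0.2'})},
    ])
    session.commit()
print(get_handles({'sky-1-job-1', 'sky-1-job-2'}))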
sky/jobs/server/core.py
CHANGED
@@ -27,6 +27,7 @@ from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
 from sky.schemas.api import responses
 from sky.serve import serve_state

@@ -666,6 +667,7 @@ def queue_v2_api(
     ], total, status_counts, total_no_filter


+@metrics_lib.time_me
 def queue_v2(
     refresh: bool,
     skip_finished: bool = False,

@@ -723,11 +725,12 @@ def queue_v2(
     if page is not None:
         raise ValueError('Limit must be specified when page is specified')

-
-
-
-
-
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

@@ -778,70 +781,74 @@ def queue_v2(
     except exceptions.SkyletMethodNotImplementedError:
         pass

-
-
-
-
-
-
-
-
-
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)

     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')

-    (jobs,
-
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)

     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
         return jobs, total, status_counts, total_no_filter

     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
-
-
-    def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-        user_hash = job.get('user_hash', None)
-        if user_hash is None:
-            # For backwards compatibility, we show jobs that do not have a
-            # user_hash. TODO(cooperc): Remove before 0.12.0.
-            return True
-        return user_hash == common_utils.get_user_hash()
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:

-
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()

-
-        filter(
-            lambda job: job.get('workspace', skylet_constants.
-                                SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces, jobs))
+            jobs = list(filter(user_hash_matches_or_missing, jobs))

-        if skip_finished:
-            # Filter out the finished jobs. If a multi-task job is partially
-            # finished, we will include all its tasks.
-            non_finished_tasks = list(
-                filter(lambda job: not job['status'].is_terminal(), jobs))
-            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
         jobs = list(
-            filter(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            filter(
+                lambda job: job.get('workspace', skylet_constants.
+                                    SKYPILOT_DEFAULT_WORKSPACE) in
+                accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
     return filtered_jobs, total, status_counts, total_no_filter

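queue_v2 is now decorated with metrics_lib.time_me and its phases are wrapped in metrics_lib.time_it(name, group=...). The implementation of those helpers lives in sky/metrics/utils.py and is not part of this diff; a rough stand-in showing the general shape of such a timing context manager could be:

import contextlib
import time
from typing import Iterator

@contextlib.contextmanager
def time_it(name: str, group: str = 'default') -> Iterator[None]:
    # Hypothetical stand-in: the real sky.metrics.utils.time_it presumably
    # reports to a metrics backend rather than printing.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(f'[{group}] {name}: {elapsed_ms:.1f} ms')

with time_it('jobs.queue.restart_controller', group='jobs'):
    time.sleep(0.05)  # placeholder for the instrumented work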
sky/jobs/state.py
CHANGED
@@ -10,7 +10,8 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple,
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 import urllib.parse

 import colorama

@@ -1250,6 +1251,25 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
     return pool[0] if pool else None


+@_init_db
+def get_pool_and_submit_info_from_job_ids(
+    job_ids: Set[int]
+) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
+    """Get the pool, cluster name, and job id on pool from job id"""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.spot_job_id, job_info_table.c.pool,
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
+        return {
+            job_id: (pool, cluster_name, job_id_on_pool_cluster)
+            for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
+        }
+
+
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
sky/jobs/utils.py
CHANGED
@@ -1325,6 +1325,23 @@ def get_managed_job_queue(
         page,
         limit,
         statuses=statuses)
+
+    job_ids = set(job['job_id'] for job in jobs)
+    job_id_to_pool_info = (
+        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
+    cluster_names: Dict[int, str] = {}
+    for job in jobs:
+        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
+        pool_info = job_id_to_pool_info.get(job['job_id'], None)
+        if pool_info and pool_info[0]:
+            cluster_name = pool_info[1]
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                job['task_name'], job['job_id'])
+        cluster_names[job['job_id']] = cluster_name
+    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
+        set(cluster_names.values()))
+
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:

@@ -1344,15 +1361,8 @@ def get_managed_job_queue(
         job['status'] = job['status'].value
         job['schedule_state'] = job['schedule_state'].value

-
-
-            cluster_name, _ = managed_job_state.get_pool_submit_info(
-                job['job_id'])
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        handle = global_user_state.get_handle_from_cluster_name(
-            cluster_name) if cluster_name is not None else None
+        cluster_name = cluster_names[job['job_id']]
+        handle = cluster_name_to_handles.get(cluster_name, None)
         if isinstance(handle, backends.CloudVmRayResourceHandle):
             resources_str = resources_utils.get_readable_resources_repr(
                 handle, simplify=True)

@@ -1507,12 +1517,20 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST

+    job_id_to_user_hash: Dict[int, str] = {}
     for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-
+            job_id_to_user_hash[job['job_id']] = job['user_hash']
+    user_hash_to_user = global_user_state.get_users(
+        job_id_to_user_hash.values())
+
+    for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
+        if job['job_id'] in job_id_to_user_hash:
+            user_hash = job_id_to_user_hash[job['job_id']]
+            user = user_hash_to_user.get(user_hash, None)
             job['user_name'] = user.name if user is not None else None
     return jobs, total, result_type, total_no_filter, status_counts

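The reshuffled get_managed_job_queue derives every job's cluster name first, then resolves all handles with a single get_handles_from_cluster_names call, instead of doing one state/database lookup per job inside the loop. A stripped-down sketch of that batching pattern follows; the helper names are placeholders for illustration, not SkyPilot APIs:

from typing import Any, Dict, List, Optional, Set

def bulk_fetch_handles(names: Set[str]) -> Dict[str, Optional[dict]]:
    # Placeholder for a single batched lookup (e.g. one SQL IN query).
    return {name: {'cluster': name} for name in names}

def cluster_name_for(job: Dict[str, Any]) -> str:
    # Placeholder for generate_managed_job_cluster_name / pool submit info.
    return f"{job['task_name']}-{job['job_id']}"

def annotate_jobs(jobs: List[Dict[str, Any]]) -> None:
    # Phase 1: derive every cluster name up front.
    names = {job['job_id']: cluster_name_for(job) for job in jobs}
    # Phase 2: one batched fetch for all names.
    handles = bulk_fetch_handles(set(names.values()))
    # Phase 3: the per-job loop now only does dictionary lookups.
    for job in jobs:
        job['handle'] = handles.get(names[job['job_id']])

jobs = [{'job_id': 1, 'task_name': 'train'}, {'job_id': 2, 'task_name': 'eval'}]
annotate_jobs(jobs)
print(jobs)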
sky/provision/kubernetes/config.py
CHANGED

@@ -7,9 +7,7 @@ from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
-from sky.utils import kubernetes_enums
 from sky.utils import yaml_utils

 logger = logging.getLogger(__name__)

@@ -28,11 +26,6 @@ def bootstrap_instances(

     _configure_services(namespace, context, config.provider_config)

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        config = _configure_ssh_jump(namespace, context, config)
-
     requested_service_account = config.node_config['spec']['serviceAccountName']
     if (requested_service_account ==
             kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):

@@ -481,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
                 f'{created_msg(binding_field, name)}')


-def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
-    """Creates a SSH jump pod to connect to the cluster.
-
-    Also updates config['auth']['ssh_proxy_command'] to use the newly created
-    jump pod.
-    """
-    provider_config = config.provider_config
-    pod_cfg = config.node_config
-
-    ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
-    ssh_jump_image = provider_config['ssh_jump_image']
-
-    volumes = pod_cfg['spec']['volumes']
-    # find 'secret-volume' and get the secret name
-    secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
-                                volumes))
-    ssh_key_secret_name = secret_volume['secret']['secretName']
-
-    # TODO(romilb): We currently split SSH jump pod and svc creation. Service
-    # is first created in authentication.py::setup_kubernetes_authentication
-    # and then SSH jump pod creation happens here. This is because we need to
-    # set the ssh_proxy_command in the ray YAML before we pass it to the
-    # autoscaler. If in the future if we can write the ssh_proxy_command to the
-    # cluster yaml through this method, then we should move the service
-    # creation here.
-
-    # TODO(romilb): We should add a check here to make sure the service is up
-    # and available before we create the SSH jump pod. If for any reason the
-    # service is missing, we should raise an error.
-
-    kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
-                                        ssh_key_secret_name, namespace, context)
-    return config
-
-
 def _configure_skypilot_system_namespace(
         provider_config: Dict[str, Any]) -> None:
     """Creates the namespace for skypilot-system mounting if it does not exist.
sky/provision/kubernetes/instance.py
CHANGED

@@ -17,7 +17,6 @@ from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.provision.kubernetes import volume
 from sky.utils import command_runner

@@ -1148,15 +1147,6 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
         if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        # Adding the jump pod to the new_nodes list as well so it can be
-        # checked if it's scheduled and running along with other pods.
-        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_pod_name, namespace)
-        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'

@@ -1320,18 +1310,6 @@ def terminate_instances(
         ray_tag_filter(cluster_name_on_cloud),
         None)

-    # Clean up the SSH jump pod if in use
-    networking_mode = network_utils.get_networking_mode(
-        provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        pod_name = list(pods.keys())[0]
-        try:
-            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                       pod_name)
-        except Exception as e:  # pylint: disable=broad-except
-            logger.warning('terminate_instances: Error occurred when analyzing '
-                           f'SSH Jump pod: {e}')
-
     if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                                namespace):
         # For high availability controllers, terminate the deployment

@@ -1367,15 +1345,6 @@ def get_cluster_info(
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None

-    port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-    network_mode_str = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=port_forward_mode.value)
-    network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-        network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,

@@ -1389,8 +1358,7 @@
             common.InstanceInfo(
                 instance_id=pod_name,
                 internal_ip=internal_ip,
-                external_ip=
-                external_ip),
+                external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
             )
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml
CHANGED

@@ -23,8 +23,7 @@ spec:
           effect: NoExecute
       containers:
         - name: server
-
-          image: berkeleyskypilot/fusermount-server:latest
+          image: berkeleyskypilot/fusermount-server:0.2.1
           securityContext:
             privileged: true
           volumeMounts:
sky/provision/kubernetes/network_utils.py
CHANGED

@@ -55,27 +55,6 @@ def get_port_mode(
     return port_mode


-def get_networking_mode(
-    mode_str: Optional[str],
-    context: Optional[str],
-) -> kubernetes_enums.KubernetesNetworkingMode:
-    """Get the networking mode from the provider config."""
-    mode_str = mode_str or skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.
-        value)
-    try:
-        networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-            mode_str)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(str(e) +
-                             ' Please check: ~/.sky/config.yaml.') from None
-    return networking_mode
-
-
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict: