skypilot-nightly 1.0.0.dev20251002__py3-none-any.whl → 1.0.0.dev20251004__py3-none-any.whl
This diff shows the changes between two publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -109
- sky/backends/cloud_vm_ray_backend.py +42 -27
- sky/client/cli/command.py +1 -11
- sky/clouds/cudo.py +1 -1
- sky/clouds/kubernetes.py +7 -19
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7340bc0f0dd8ae74.js → webpack-3286453d56f3c0a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +9 -0
- sky/execution.py +24 -2
- sky/global_user_state.py +16 -0
- sky/jobs/recovery_strategy.py +45 -0
- sky/jobs/server/core.py +60 -53
- sky/jobs/state.py +21 -1
- sky/jobs/utils.py +29 -11
- sky/provision/kubernetes/config.py +0 -42
- sky/provision/kubernetes/instance.py +1 -33
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network_utils.py +0 -21
- sky/provision/kubernetes/utils.py +136 -300
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +6 -0
- sky/server/server.py +6 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -13
- sky/utils/context.py +12 -7
- sky/utils/env_options.py +4 -0
- sky/utils/kubernetes_enums.py +2 -15
- sky/utils/schemas.py +17 -6
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/METADATA +38 -37
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/RECORD +55 -56
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- /sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -7,6 +7,7 @@ resources:
 """
 import asyncio
 import logging
+import os
 import traceback
 import typing
 from typing import Optional, Set
@@ -16,16 +17,19 @@ from sky import dag as dag_lib
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
 from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -45,6 +49,13 @@ MAX_JOB_CHECKING_RETRY = 10
 # cluster before its status can be updated by the job controller.
 _AUTODOWN_MINUTES = 10

+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
+

 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
@@ -213,6 +224,7 @@ class StrategyExecutor:
     **kwargs,
     _try_cancel_if_cluster_is_init=True,
 )
+self._logger.debug(f'sdk.cancel request ID: {request_id}')
 await context_utils.to_thread(
     sdk.get,
     request_id,
@@ -371,6 +383,31 @@ class StrategyExecutor:
 usage_lib.messages.usage.set_internal()
 if self.pool is None:
     assert self.cluster_name is not None
+
+    # sdk.launch will implicitly start the API server,
+    # but then the API server will inherit the current
+    # env vars/user, which we may not want.
+    # Instead, clear env vars here and call api_start
+    # explicitly.
+    vars_to_restore = {}
+    try:
+        for env_var in ENV_VARS_TO_CLEAR:
+            vars_to_restore[env_var] = os.environ.pop(
+                env_var, None)
+            self._logger.debug('Cleared env var: '
+                               f'{env_var}')
+        self._logger.debug('Env vars for api_start: '
+                           f'{os.environ}')
+        await context_utils.to_thread(sdk.api_start)
+        self._logger.info('API server started.')
+    finally:
+        for env_var, value in vars_to_restore.items():
+            if value is not None:
+                self._logger.debug(
+                    'Restored env var: '
+                    f'{env_var}: {value}')
+                os.environ[env_var] = value
+
 log_file = _get_logger_file(self._logger)
 request_id = None
 try:
@@ -392,6 +429,8 @@ class StrategyExecutor:
 # down=True,
 _is_launched_by_jobs_controller=True,
 )
+self._logger.debug('sdk.launch request ID: '
+                   f'{request_id}')
 if log_file is None:
     raise OSError('Log file is None')
 with open(log_file, 'a', encoding='utf-8') as f:
@@ -404,6 +443,8 @@ class StrategyExecutor:
 if request_id:
     req = await context_utils.to_thread(
         sdk.api_cancel, request_id)
+    self._logger.debug('sdk.api_cancel request '
+                       f'ID: {req}')
     try:
         await context_utils.to_thread(
             sdk.get, req)
@@ -427,6 +468,8 @@ class StrategyExecutor:
 self.dag,
 cluster_name=self.cluster_name,
 )
+self._logger.debug('sdk.exec request ID: '
+                   f'{request_id}')
 job_id_on_pool_cluster, _ = (
     await context_utils.to_thread(
         sdk.get, request_id))
@@ -434,6 +477,8 @@ class StrategyExecutor:
 if request_id:
     req = await context_utils.to_thread(
         sdk.api_cancel, request_id)
+    self._logger.debug('sdk.api_cancel request '
+                       f'ID: {req}')
     try:
         await context_utils.to_thread(
             sdk.get, req)
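The new block in StrategyExecutor follows a clear-then-restore pattern: selected environment variables are popped before sdk.api_start() runs (so the spawned API server does not inherit the caller's config, user, or debug settings) and are put back in a finally block. Below is a minimal standalone sketch of that pattern, not SkyPilot's code: the variable names and the plain callable stand in for SkyPilot's constants and the async sdk.api_start call.

import os
from typing import Dict, Iterable, Optional

# Hypothetical names; the real diff clears keys from skypilot_config,
# skylet constants, and env_options.
VARS_TO_CLEAR = ['SKYPILOT_CONFIG', 'SKYPILOT_USER_ID', 'SKYPILOT_USER']

def call_with_cleared_env(fn, vars_to_clear: Iterable[str] = VARS_TO_CLEAR):
    """Run fn() with the given env vars removed, restoring them afterwards."""
    saved: Dict[str, Optional[str]] = {}
    try:
        for name in vars_to_clear:
            # os.environ.pop returns None if the variable was not set.
            saved[name] = os.environ.pop(name, None)
        return fn()
    finally:
        for name, value in saved.items():
            if value is not None:
                os.environ[name] = value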
sky/jobs/server/core.py
CHANGED
@@ -27,6 +27,7 @@ from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
 from sky.schemas.api import responses
 from sky.serve import serve_state
@@ -666,6 +667,7 @@ def queue_v2_api(
 ], total, status_counts, total_no_filter


+@metrics_lib.time_me
 def queue_v2(
     refresh: bool,
     skip_finished: bool = False,
@@ -723,11 +725,12 @@ def queue_v2(
 if page is not None:
     raise ValueError('Limit must be specified when page is specified')

- … (5 removed lines not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
 backend = backend_utils.get_backend_from_handle(handle)
 assert isinstance(backend, backends.CloudVmRayBackend)
@@ -778,70 +781,74 @@ def queue_v2(
 except exceptions.SkyletMethodNotImplementedError:
     pass

- … (9 removed lines not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes, statuses)
+with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+    returncode, job_table_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)

 if returncode != 0:
     logger.error(job_table_payload + stderr)
     raise RuntimeError('Failed to fetch managed jobs with returncode: '
                        f'{returncode}.\n{job_table_payload + stderr}')

-(jobs,
- … (1 removed line not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+    (jobs, total, result_type, total_no_filter, status_counts
+    ) = managed_job_utils.load_managed_job_queue(job_table_payload)

 if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
     return jobs, total, status_counts, total_no_filter

 # Backward compatibility for old jobs controller without filtering
 # TODO(hailong): remove this after 0.12.0
- … (2 removed lines not rendered in the source diff view)
-def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-    user_hash = job.get('user_hash', None)
-    if user_hash is None:
-        # For backwards compatibility, we show jobs that do not have a
-        # user_hash. TODO(cooperc): Remove before 0.12.0.
-        return True
-    return user_hash == common_utils.get_user_hash()
+with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+    if not all_users:

- … (1 removed line not rendered in the source diff view)
+        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+            user_hash = job.get('user_hash', None)
+            if user_hash is None:
+                # For backwards compatibility, we show jobs that do not have
+                # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                return True
+            return user_hash == common_utils.get_user_hash()

- … (1 removed line not rendered in the source diff view)
-filter(
-    lambda job: job.get('workspace', skylet_constants.
-    SKYPILOT_DEFAULT_WORKSPACE) in
-    accessible_workspaces, jobs))
+        jobs = list(filter(user_hash_matches_or_missing, jobs))

-if skip_finished:
-    # Filter out the finished jobs. If a multi-task job is partially
-    # finished, we will include all its tasks.
-    non_finished_tasks = list(
-        filter(lambda job: not job['status'].is_terminal(), jobs))
-    non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
 jobs = list(
-    filter(
- … (15 removed lines not rendered in the source diff view)
+        filter(
+            lambda job: job.get('workspace', skylet_constants.
+                                SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces, jobs))
+
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(lambda job: not job['status'].is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+        jobs,
+        workspace_match,
+        name_match,
+        pool_match,
+        page=page,
+        limit=limit,
+        user_match=user_match,
+        enable_user_match=True,
+        statuses=statuses,
+    )
 return filtered_jobs, total, status_counts, total_no_filter
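This change instruments each stage of queue_v2 (controller restart, code generation, run_on_head, payload loading, filtering) with timing scopes from sky.metrics and decorates the function itself. The metrics_lib API is not shown in this diff; the sketch below is a generic, stdlib-only approximation of such time_it/time_me helpers, with hypothetical names, not SkyPilot's implementation.

import functools
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def time_block(name: str, group: str = 'default'):
    """Log how long the wrapped block takes (sketch of a time_it-style helper)."""
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.debug('%s/%s took %.3fs', group, name, time.perf_counter() - start)

def timed(fn):
    """Decorator variant (sketch of a time_me-style helper)."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        with time_block(fn.__qualname__):
            return fn(*args, **kwargs)
    return wrapper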
sky/jobs/state.py
CHANGED
@@ -10,7 +10,8 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple,
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 import urllib.parse

 import colorama
@@ -1250,6 +1251,25 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
 return pool[0] if pool else None


+@_init_db
+def get_pool_and_submit_info_from_job_ids(
+    job_ids: Set[int]
+) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
+    """Get the pool, cluster name, and job id on pool from job id"""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.spot_job_id, job_info_table.c.pool,
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
+    return {
+        job_id: (pool, cluster_name, job_id_on_pool_cluster)
+        for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
+    }
+
+
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
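get_pool_and_submit_info_from_job_ids replaces repeated single-row lookups with one SELECT ... WHERE spot_job_id IN (...) that returns a dict keyed by job id. Below is a self-contained sketch of the same batched-query shape, using a toy in-memory SQLite table rather than SkyPilot's real job_info schema.

from typing import Dict, Optional, Set, Tuple

import sqlalchemy

metadata = sqlalchemy.MetaData()
# Toy stand-in for the job_info table used in the diff.
job_info = sqlalchemy.Table(
    'job_info', metadata,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('pool', sqlalchemy.String, nullable=True),
    sqlalchemy.Column('current_cluster_name', sqlalchemy.String, nullable=True),
    sqlalchemy.Column('job_id_on_pool_cluster', sqlalchemy.Integer, nullable=True))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

def pool_info_for(
    job_ids: Set[int]
) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
    """One query for all job ids instead of one query per job."""
    with engine.connect() as conn:
        rows = conn.execute(
            sqlalchemy.select(job_info.c.spot_job_id, job_info.c.pool,
                              job_info.c.current_cluster_name,
                              job_info.c.job_id_on_pool_cluster).where(
                                  job_info.c.spot_job_id.in_(job_ids))).fetchall()
    return {jid: (pool, name, pool_job_id) for jid, pool, name, pool_job_id in rows}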
sky/jobs/utils.py
CHANGED
@@ -1325,6 +1325,23 @@ def get_managed_job_queue(
     page,
     limit,
     statuses=statuses)
+
+job_ids = set(job['job_id'] for job in jobs)
+job_id_to_pool_info = (
+    managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
+cluster_names: Dict[int, str] = {}
+for job in jobs:
+    # pool info is (pool, cluster_name, job_id_on_pool_cluster)
+    pool_info = job_id_to_pool_info.get(job['job_id'], None)
+    if pool_info and pool_info[0]:
+        cluster_name = pool_info[1]
+    else:
+        cluster_name = generate_managed_job_cluster_name(
+            job['task_name'], job['job_id'])
+    cluster_names[job['job_id']] = cluster_name
+cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
+    set(cluster_names.values()))
+
 for job in jobs:
     end_at = job['end_at']
     if end_at is None:
@@ -1344,15 +1361,8 @@ def get_managed_job_queue(
 job['status'] = job['status'].value
 job['schedule_state'] = job['schedule_state'].value

- … (2 removed lines not rendered in the source diff view)
-    cluster_name, _ = managed_job_state.get_pool_submit_info(
-        job['job_id'])
-else:
-    cluster_name = generate_managed_job_cluster_name(
-        job['task_name'], job['job_id'])
-handle = global_user_state.get_handle_from_cluster_name(
-    cluster_name) if cluster_name is not None else None
+cluster_name = cluster_names[job['job_id']]
+handle = cluster_name_to_handles.get(cluster_name, None)
 if isinstance(handle, backends.CloudVmRayResourceHandle):
     resources_str = resources_utils.get_readable_resources_repr(
         handle, simplify=True)
@@ -1507,12 +1517,20 @@ def load_managed_job_queue(
 total_no_filter = total
 result_type = ManagedJobQueueResultType.LIST

+job_id_to_user_hash: Dict[int, str] = {}
 for job in jobs:
-    job['status'] = managed_job_state.ManagedJobStatus(job['status'])
     if 'user_hash' in job and job['user_hash'] is not None:
         # Skip jobs that do not have user_hash info.
         # TODO(cooperc): Remove check before 0.12.0.
-        … (1 removed line not rendered in the source diff view)
+        job_id_to_user_hash[job['job_id']] = job['user_hash']
+user_hash_to_user = global_user_state.get_users(
+    job_id_to_user_hash.values())
+
+for job in jobs:
+    job['status'] = managed_job_state.ManagedJobStatus(job['status'])
+    if job['job_id'] in job_id_to_user_hash:
+        user_hash = job_id_to_user_hash[job['job_id']]
+        user = user_hash_to_user.get(user_hash, None)
         job['user_name'] = user.name if user is not None else None
 return jobs, total, result_type, total_no_filter, status_counts
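Both hunks in sky/jobs/utils.py apply the same prefetch pattern: collect the keys up front, fetch everything in one batched call (get_pool_and_submit_info_from_job_ids, get_handles_from_cluster_names, get_users), then read results from a dict inside the per-job loop. A generic sketch of that N+1 removal follows; enrich_jobs and fetch_many are hypothetical names used only for illustration.

from typing import Any, Callable, Dict, List, Set

def enrich_jobs(jobs: List[Dict[str, Any]],
                fetch_many: Callable[[Set[int]], Dict[int, Any]]) -> None:
    """Attach per-job info with one batched lookup instead of a query per job."""
    # Before this change the loop body issued a lookup per job; here everything
    # is fetched once and read from a dict.
    infos = fetch_many({job['job_id'] for job in jobs})
    for job in jobs:
        job['info'] = infos.get(job['job_id'])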
sky/provision/kubernetes/config.py
CHANGED
@@ -7,9 +7,7 @@ from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
-from sky.utils import kubernetes_enums
 from sky.utils import yaml_utils

 logger = logging.getLogger(__name__)
@@ -28,11 +26,6 @@ def bootstrap_instances(

 _configure_services(namespace, context, config.provider_config)

-networking_mode = network_utils.get_networking_mode(
-    config.provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    config = _configure_ssh_jump(namespace, context, config)
-
 requested_service_account = config.node_config['spec']['serviceAccountName']
 if (requested_service_account ==
         kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
@@ -481,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
 f'{created_msg(binding_field, name)}')


-def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
-    """Creates a SSH jump pod to connect to the cluster.
-
-    Also updates config['auth']['ssh_proxy_command'] to use the newly created
-    jump pod.
-    """
-    provider_config = config.provider_config
-    pod_cfg = config.node_config
-
-    ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
-    ssh_jump_image = provider_config['ssh_jump_image']
-
-    volumes = pod_cfg['spec']['volumes']
-    # find 'secret-volume' and get the secret name
-    secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
-                                volumes))
-    ssh_key_secret_name = secret_volume['secret']['secretName']
-
-    # TODO(romilb): We currently split SSH jump pod and svc creation. Service
-    # is first created in authentication.py::setup_kubernetes_authentication
-    # and then SSH jump pod creation happens here. This is because we need to
-    # set the ssh_proxy_command in the ray YAML before we pass it to the
-    # autoscaler. If in the future if we can write the ssh_proxy_command to the
-    # cluster yaml through this method, then we should move the service
-    # creation here.
-
-    # TODO(romilb): We should add a check here to make sure the service is up
-    # and available before we create the SSH jump pod. If for any reason the
-    # service is missing, we should raise an error.
-
-    kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
-                                        ssh_key_secret_name, namespace, context)
-    return config
-
-
 def _configure_skypilot_system_namespace(
         provider_config: Dict[str, Any]) -> None:
     """Creates the namespace for skypilot-system mounting if it does not exist.
sky/provision/kubernetes/instance.py
CHANGED
@@ -17,7 +17,6 @@ from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.provision.kubernetes import volume
 from sky.utils import command_runner
@@ -1148,15 +1147,6 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
 if head_pod_name is None and _is_head(pod):
     head_pod_name = pod.metadata.name

-networking_mode = network_utils.get_networking_mode(
-    config.provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    # Adding the jump pod to the new_nodes list as well so it can be
-    # checked if it's scheduled and running along with other pods.
-    ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-    jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-        ssh_jump_pod_name, namespace)
-    pods.append(jump_pod)
 provision_timeout = provider_config['timeout']

 wait_str = ('indefinitely'
@@ -1320,18 +1310,6 @@ def terminate_instances(
 ray_tag_filter(cluster_name_on_cloud),
 None)

-# Clean up the SSH jump pod if in use
-networking_mode = network_utils.get_networking_mode(
-    provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    pod_name = list(pods.keys())[0]
-    try:
-        kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                   pod_name)
-    except Exception as e:  # pylint: disable=broad-except
-        logger.warning('terminate_instances: Error occurred when analyzing '
-                       f'SSH Jump pod: {e}')
-
 if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                            namespace):
     # For high availability controllers, terminate the deployment
@@ -1367,15 +1345,6 @@ def get_cluster_info(
 pods: Dict[str, List[common.InstanceInfo]] = {}
 head_pod_name = None

-port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-network_mode_str = skypilot_config.get_effective_region_config(
-    cloud='kubernetes',
-    region=context,
-    keys=('networking_mode',),
-    default_value=port_forward_mode.value)
-network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-    network_mode_str)
-external_ip = kubernetes_utils.get_external_ip(network_mode, context)
 port = 22
 if not provider_config.get('use_internal_ips', False):
     port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1389,8 +1358,7 @@ def get_cluster_info(
 common.InstanceInfo(
     instance_id=pod_name,
     internal_ip=internal_ip,
-    external_ip=
-    external_ip),
+    external_ip=None,
     ssh_port=port,
     tags=pod.metadata.labels,
 )
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml
CHANGED
@@ -23,8 +23,7 @@ spec:
     effect: NoExecute
 containers:
 - name: server
-  … (1 removed line not rendered in the source diff view)
-  image: berkeleyskypilot/fusermount-server:latest
+  image: berkeleyskypilot/fusermount-server:0.2.1
   securityContext:
     privileged: true
   volumeMounts:
sky/provision/kubernetes/network_utils.py
CHANGED
@@ -55,27 +55,6 @@ def get_port_mode(
 return port_mode


-def get_networking_mode(
-    mode_str: Optional[str],
-    context: Optional[str],
-) -> kubernetes_enums.KubernetesNetworkingMode:
-    """Get the networking mode from the provider config."""
-    mode_str = mode_str or skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.
-        value)
-    try:
-        networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-            mode_str)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(str(e) +
-                             ' Please check: ~/.sky/config.yaml.') from None
-    return networking_mode
-
-
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict: