skypilot-nightly 1.0.0.dev20251002__py3-none-any.whl → 1.0.0.dev20251004__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -109
  3. sky/backends/cloud_vm_ray_backend.py +42 -27
  4. sky/client/cli/command.py +1 -11
  5. sky/clouds/cudo.py +1 -1
  6. sky/clouds/kubernetes.py +7 -19
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{webpack-7340bc0f0dd8ae74.js → webpack-3286453d56f3c0a0.js} +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/config.html +1 -1
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/infra/[context].html +1 -1
  19. sky/dashboard/out/infra.html +1 -1
  20. sky/dashboard/out/jobs/[job].html +1 -1
  21. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  22. sky/dashboard/out/jobs.html +1 -1
  23. sky/dashboard/out/users.html +1 -1
  24. sky/dashboard/out/volumes.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/data/storage_utils.py +9 -0
  29. sky/execution.py +24 -2
  30. sky/global_user_state.py +16 -0
  31. sky/jobs/recovery_strategy.py +45 -0
  32. sky/jobs/server/core.py +60 -53
  33. sky/jobs/state.py +21 -1
  34. sky/jobs/utils.py +29 -11
  35. sky/provision/kubernetes/config.py +0 -42
  36. sky/provision/kubernetes/instance.py +1 -33
  37. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  38. sky/provision/kubernetes/network_utils.py +0 -21
  39. sky/provision/kubernetes/utils.py +136 -300
  40. sky/server/auth/loopback.py +38 -0
  41. sky/server/auth/oauth2_proxy.py +6 -0
  42. sky/server/server.py +6 -0
  43. sky/setup_files/dependencies.py +1 -0
  44. sky/templates/kubernetes-ray.yml.j2 +4 -13
  45. sky/utils/context.py +12 -7
  46. sky/utils/env_options.py +4 -0
  47. sky/utils/kubernetes_enums.py +2 -15
  48. sky/utils/schemas.py +17 -6
  49. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/METADATA +38 -37
  50. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/RECORD +55 -56
  51. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
  52. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
  53. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  54. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  55. /sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py CHANGED
@@ -7,6 +7,7 @@ resources:
 """
 import asyncio
 import logging
+import os
 import traceback
 import typing
 from typing import Optional, Set
@@ -16,16 +17,19 @@ from sky import dag as dag_lib
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
 from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -45,6 +49,13 @@ MAX_JOB_CHECKING_RETRY = 10
 # cluster before its status can be updated by the job controller.
 _AUTODOWN_MINUTES = 10

+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
+

 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
@@ -213,6 +224,7 @@ class StrategyExecutor:
             **kwargs,
             _try_cancel_if_cluster_is_init=True,
         )
+        self._logger.debug(f'sdk.cancel request ID: {request_id}')
         await context_utils.to_thread(
             sdk.get,
             request_id,
@@ -371,6 +383,31 @@
         usage_lib.messages.usage.set_internal()
         if self.pool is None:
             assert self.cluster_name is not None
+
+            # sdk.launch will implicitly start the API server,
+            # but then the API server will inherit the current
+            # env vars/user, which we may not want.
+            # Instead, clear env vars here and call api_start
+            # explicitly.
+            vars_to_restore = {}
+            try:
+                for env_var in ENV_VARS_TO_CLEAR:
+                    vars_to_restore[env_var] = os.environ.pop(
+                        env_var, None)
+                    self._logger.debug('Cleared env var: '
+                                       f'{env_var}')
+                self._logger.debug('Env vars for api_start: '
+                                   f'{os.environ}')
+                await context_utils.to_thread(sdk.api_start)
+                self._logger.info('API server started.')
+            finally:
+                for env_var, value in vars_to_restore.items():
+                    if value is not None:
+                        self._logger.debug(
+                            'Restored env var: '
+                            f'{env_var}: {value}')
+                        os.environ[env_var] = value
+
             log_file = _get_logger_file(self._logger)
             request_id = None
             try:
@@ -392,6 +429,8 @@
                 # down=True,
                 _is_launched_by_jobs_controller=True,
             )
+            self._logger.debug('sdk.launch request ID: '
+                               f'{request_id}')
             if log_file is None:
                 raise OSError('Log file is None')
             with open(log_file, 'a', encoding='utf-8') as f:
@@ -404,6 +443,8 @@
             if request_id:
                 req = await context_utils.to_thread(
                     sdk.api_cancel, request_id)
+                self._logger.debug('sdk.api_cancel request '
+                                   f'ID: {req}')
                 try:
                     await context_utils.to_thread(
                         sdk.get, req)
@@ -427,6 +468,8 @@
             self.dag,
             cluster_name=self.cluster_name,
         )
+        self._logger.debug('sdk.exec request ID: '
+                           f'{request_id}')
         job_id_on_pool_cluster, _ = (
             await context_utils.to_thread(
                 sdk.get, request_id))
@@ -434,6 +477,8 @@
         if request_id:
             req = await context_utils.to_thread(
                 sdk.api_cancel, request_id)
+            self._logger.debug('sdk.api_cancel request '
+                               f'ID: {req}')
             try:
                 await context_utils.to_thread(
                     sdk.get, req)
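The added block above clears a handful of caller-specific environment variables before `sdk.api_start` and restores them afterwards in a try/finally. The same clear-then-restore idea can be packaged as a reusable context manager; a minimal standard-library sketch under that assumption (the helper name and the example variable names are placeholders, not part of this release):

import contextlib
import os
from typing import Dict, Iterable, Optional

@contextlib.contextmanager
def cleared_env(var_names: Iterable[str]):
    """Temporarily remove the given environment variables.

    Variables that were set are restored on exit, even if the body raises.
    """
    saved: Dict[str, Optional[str]] = {}
    try:
        for name in var_names:
            # os.environ.pop returns None when the variable was not set.
            saved[name] = os.environ.pop(name, None)
        yield
    finally:
        for name, value in saved.items():
            if value is not None:
                os.environ[name] = value

# Usage: run a step without leaking caller-specific settings to it.
with cleared_env(['EXAMPLE_CONFIG_PATH', 'EXAMPLE_DEBUG_FLAG']):
    pass  # e.g. start a server process here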
sky/jobs/server/core.py CHANGED
@@ -27,6 +27,7 @@ from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
 from sky.schemas.api import responses
 from sky.serve import serve_state
@@ -666,6 +667,7 @@ def queue_v2_api(
     ], total, status_counts, total_no_filter


+@metrics_lib.time_me
 def queue_v2(
     refresh: bool,
     skip_finished: bool = False,
@@ -723,11 +725,12 @@ def queue_v2(
     if page is not None:
         raise ValueError('Limit must be specified when page is specified')

-    handle = _maybe_restart_controller(refresh,
-                                       stopped_message='No in-progress '
-                                       'managed jobs.',
-                                       spinner_message='Checking '
-                                       'managed jobs')
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

@@ -778,70 +781,74 @@
     except exceptions.SkyletMethodNotImplementedError:
         pass

-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished, accessible_workspaces, job_ids, workspace_match,
-        name_match, pool_match, page, limit, user_hashes, statuses)
-    returncode, job_table_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)

     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')

-    (jobs, total, result_type, total_no_filter, status_counts
-    ) = managed_job_utils.load_managed_job_queue(job_table_payload)
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)

     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
         return jobs, total, status_counts, total_no_filter

     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
-    if not all_users:
-
-        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-            user_hash = job.get('user_hash', None)
-            if user_hash is None:
-                # For backwards compatibility, we show jobs that do not have a
-                # user_hash. TODO(cooperc): Remove before 0.12.0.
-                return True
-            return user_hash == common_utils.get_user_hash()
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:

-        jobs = list(filter(user_hash_matches_or_missing, jobs))
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()

-    jobs = list(
-        filter(
-            lambda job: job.get('workspace', skylet_constants.
-                                SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces, jobs))
+            jobs = list(filter(user_hash_matches_or_missing, jobs))

-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(lambda job: not job['status'].is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
         jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
-        jobs,
-        workspace_match,
-        name_match,
-        pool_match,
-        page=page,
-        limit=limit,
-        user_match=user_match,
-        enable_user_match=True,
-        statuses=statuses,
-    )
+            filter(
+                lambda job: job.get('workspace', skylet_constants.
+                                    SKYPILOT_DEFAULT_WORKSPACE) in
+                accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
     return filtered_jobs, total, status_counts, total_no_filter
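The new `@metrics_lib.time_me` decorator and `metrics_lib.time_it(...)` blocks above time the whole `queue_v2` call and its individual phases. A rough standard-library-only sketch of that decorator/context-manager pairing (the helper names here are illustrative, not the SkyPilot metrics API):

import contextlib
import functools
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@contextlib.contextmanager
def time_block(name: str):
    """Log how long the wrapped block took."""
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.info('%s took %.3fs', name, time.perf_counter() - start)

def timed(func):
    """Decorator form of the same idea, for whole functions."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with time_block(func.__qualname__):
            return func(*args, **kwargs)
    return wrapper

@timed
def fetch_queue():
    with time_block('fetch_queue.sleep'):  # time one phase inside the call
        time.sleep(0.1)

fetch_queue()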
sky/jobs/state.py CHANGED
@@ -10,7 +10,8 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 import urllib.parse

 import colorama
@@ -1250,6 +1251,25 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
     return pool[0] if pool else None


+@_init_db
+def get_pool_and_submit_info_from_job_ids(
+    job_ids: Set[int]
+) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
+    """Get the pool, cluster name, and job id on pool from job id"""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.spot_job_id, job_info_table.c.pool,
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
+    return {
+        job_id: (pool, cluster_name, job_id_on_pool_cluster)
+        for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
+    }
+
+
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
sky/jobs/utils.py CHANGED
@@ -1325,6 +1325,23 @@ def get_managed_job_queue(
         page,
         limit,
         statuses=statuses)
+
+    job_ids = set(job['job_id'] for job in jobs)
+    job_id_to_pool_info = (
+        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
+    cluster_names: Dict[int, str] = {}
+    for job in jobs:
+        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
+        pool_info = job_id_to_pool_info.get(job['job_id'], None)
+        if pool_info and pool_info[0]:
+            cluster_name = pool_info[1]
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                job['task_name'], job['job_id'])
+        cluster_names[job['job_id']] = cluster_name
+    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
+        set(cluster_names.values()))
+
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1344,15 +1361,8 @@
         job['status'] = job['status'].value
         job['schedule_state'] = job['schedule_state'].value

-        pool = managed_job_state.get_pool_from_job_id(job['job_id'])
-        if pool is not None:
-            cluster_name, _ = managed_job_state.get_pool_submit_info(
-                job['job_id'])
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        handle = global_user_state.get_handle_from_cluster_name(
-            cluster_name) if cluster_name is not None else None
+        cluster_name = cluster_names[job['job_id']]
+        handle = cluster_name_to_handles.get(cluster_name, None)
         if isinstance(handle, backends.CloudVmRayResourceHandle):
             resources_str = resources_utils.get_readable_resources_repr(
                 handle, simplify=True)
@@ -1507,12 +1517,20 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST

+    job_id_to_user_hash: Dict[int, str] = {}
     for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-            user = global_user_state.get_user(job['user_hash'])
+            job_id_to_user_hash[job['job_id']] = job['user_hash']
+    user_hash_to_user = global_user_state.get_users(
+        job_id_to_user_hash.values())
+
+    for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
+        if job['job_id'] in job_id_to_user_hash:
+            user_hash = job_id_to_user_hash[job['job_id']]
+            user = user_hash_to_user.get(user_hash, None)
             job['user_name'] = user.name if user is not None else None
     return jobs, total, result_type, total_no_filter, status_counts
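Both hunks above follow the same shape: collect the keys from every row, do one bulk lookup (`get_pool_and_submit_info_from_job_ids`, `get_handles_from_cluster_names`, `get_users`), then join the results back through an in-memory dict instead of querying per job. A toy sketch of that shape, with a stand-in for the batched lookup (nothing below is SkyPilot API):

from typing import Dict, Iterable, List

def bulk_fetch_names(user_hashes: Iterable[str]) -> Dict[str, str]:
    """Stand-in for a single batched lookup (e.g. one SQL IN query)."""
    fake_db = {'u1': 'alice', 'u2': 'bob'}
    return {h: fake_db[h] for h in user_hashes if h in fake_db}

def annotate_jobs(jobs: List[dict]) -> None:
    # Pass 1: collect keys from every row.
    hashes = {job['user_hash'] for job in jobs if job.get('user_hash')}
    # One bulk lookup instead of len(jobs) lookups.
    names = bulk_fetch_names(hashes)
    # Pass 2: join back in memory.
    for job in jobs:
        job['user_name'] = names.get(job.get('user_hash'))

jobs = [{'job_id': 1, 'user_hash': 'u1'}, {'job_id': 2, 'user_hash': None}]
annotate_jobs(jobs)
print(jobs)  # job 1 gets 'alice', job 2 gets None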
sky/provision/kubernetes/config.py CHANGED
@@ -7,9 +7,7 @@ from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
-from sky.utils import kubernetes_enums
 from sky.utils import yaml_utils

 logger = logging.getLogger(__name__)
@@ -28,11 +26,6 @@ def bootstrap_instances(

     _configure_services(namespace, context, config.provider_config)

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        config = _configure_ssh_jump(namespace, context, config)
-
     requested_service_account = config.node_config['spec']['serviceAccountName']
     if (requested_service_account ==
             kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
@@ -481,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
                 f'{created_msg(binding_field, name)}')


-def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
-    """Creates a SSH jump pod to connect to the cluster.
-
-    Also updates config['auth']['ssh_proxy_command'] to use the newly created
-    jump pod.
-    """
-    provider_config = config.provider_config
-    pod_cfg = config.node_config
-
-    ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
-    ssh_jump_image = provider_config['ssh_jump_image']
-
-    volumes = pod_cfg['spec']['volumes']
-    # find 'secret-volume' and get the secret name
-    secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
-                                volumes))
-    ssh_key_secret_name = secret_volume['secret']['secretName']
-
-    # TODO(romilb): We currently split SSH jump pod and svc creation. Service
-    # is first created in authentication.py::setup_kubernetes_authentication
-    # and then SSH jump pod creation happens here. This is because we need to
-    # set the ssh_proxy_command in the ray YAML before we pass it to the
-    # autoscaler. If in the future if we can write the ssh_proxy_command to the
-    # cluster yaml through this method, then we should move the service
-    # creation here.
-
-    # TODO(romilb): We should add a check here to make sure the service is up
-    # and available before we create the SSH jump pod. If for any reason the
-    # service is missing, we should raise an error.
-
-    kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
-                                        ssh_key_secret_name, namespace, context)
-    return config
-
-
 def _configure_skypilot_system_namespace(
         provider_config: Dict[str, Any]) -> None:
     """Creates the namespace for skypilot-system mounting if it does not exist.
sky/provision/kubernetes/instance.py CHANGED
@@ -17,7 +17,6 @@ from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.provision.kubernetes import volume
 from sky.utils import command_runner
@@ -1148,15 +1147,6 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
         if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        # Adding the jump pod to the new_nodes list as well so it can be
-        # checked if it's scheduled and running along with other pods.
-        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_pod_name, namespace)
-        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
@@ -1320,18 +1310,6 @@
         ray_tag_filter(cluster_name_on_cloud),
         None)

-    # Clean up the SSH jump pod if in use
-    networking_mode = network_utils.get_networking_mode(
-        provider_config.get('networking_mode'), context)
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        pod_name = list(pods.keys())[0]
-        try:
-            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                       pod_name)
-        except Exception as e:  # pylint: disable=broad-except
-            logger.warning('terminate_instances: Error occurred when analyzing '
-                           f'SSH Jump pod: {e}')
-
     if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                                namespace):
         # For high availability controllers, terminate the deployment
@@ -1367,15 +1345,6 @@
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None

-    port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-    network_mode_str = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=port_forward_mode.value)
-    network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-        network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1389,8 +1358,7 @@
             common.InstanceInfo(
                 instance_id=pod_name,
                 internal_ip=internal_ip,
-                external_ip=(None if network_mode == port_forward_mode else
-                             external_ip),
+                external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
             )
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml CHANGED
@@ -23,8 +23,7 @@ spec:
         effect: NoExecute
       containers:
         - name: server
-          # TODO(aylei): version strategy of our addon images
-          image: berkeleyskypilot/fusermount-server:latest
+          image: berkeleyskypilot/fusermount-server:0.2.1
           securityContext:
             privileged: true
           volumeMounts:
sky/provision/kubernetes/network_utils.py CHANGED
@@ -55,27 +55,6 @@ def get_port_mode(
     return port_mode


-def get_networking_mode(
-    mode_str: Optional[str],
-    context: Optional[str],
-) -> kubernetes_enums.KubernetesNetworkingMode:
-    """Get the networking mode from the provider config."""
-    mode_str = mode_str or skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.
-        value)
-    try:
-        networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-            mode_str)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(str(e) +
-                             ' Please check: ~/.sky/config.yaml.') from None
-    return networking_mode
-
-
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict: