skypilot-nightly 1.0.0.dev20251004__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.


Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +17 -21
  3. sky/backends/backend.py +1 -3
  4. sky/backends/cloud_vm_ray_backend.py +8 -20
  5. sky/backends/local_docker_backend.py +0 -5
  6. sky/client/sdk.py +24 -23
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/execution.py +1 -11
  24. sky/global_user_state.py +16 -5
  25. sky/jobs/constants.py +1 -7
  26. sky/jobs/controller.py +9 -1
  27. sky/jobs/scheduler.py +30 -15
  28. sky/jobs/server/core.py +8 -3
  29. sky/jobs/utils.py +30 -2
  30. sky/metrics/utils.py +62 -45
  31. sky/provision/instance_setup.py +32 -10
  32. sky/provision/kubernetes/utils.py +4 -1
  33. sky/provision/provisioner.py +10 -7
  34. sky/schemas/api/responses.py +2 -2
  35. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  36. sky/server/common.py +1 -0
  37. sky/server/config.py +2 -0
  38. sky/server/metrics.py +3 -1
  39. sky/server/requests/executor.py +103 -77
  40. sky/server/requests/requests.py +26 -11
  41. sky/server/server.py +16 -0
  42. sky/skylet/constants.py +9 -1
  43. sky/skylet/events.py +17 -0
  44. sky/skylet/skylet.py +3 -0
  45. sky/skypilot_config.py +2 -1
  46. sky/templates/kubernetes-ray.yml.j2 +5 -0
  47. sky/utils/context_utils.py +5 -1
  48. sky/utils/controller_utils.py +14 -0
  49. sky/utils/db/db_utils.py +2 -0
  50. sky/utils/db/migration_utils.py +11 -2
  51. sky/volumes/server/server.py +2 -2
  52. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA +37 -37
  53. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/RECORD +59 -58
  54. /sky/dashboard/out/_next/static/{KL03GEega4QqDqTOMtA_w → MnvNdzHHpiZG1_oKSpbxF}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{KL03GEega4QqDqTOMtA_w → MnvNdzHHpiZG1_oKSpbxF}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -112,7 +112,6 @@ def _execute(
     stages: Optional[List[Stage]] = None,
     cluster_name: Optional[str] = None,
     detach_setup: bool = False,
-    detach_run: bool = False,
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
@@ -157,8 +156,6 @@ def _execute(
       job itself. You can safely ctrl-c to detach from logging, and it will
       not interrupt the setup process. To see the logs again after detaching,
       use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
-    detach_run: If True, as soon as a job is submitted, return from this
-      function and do not stream execution logs.
     idle_minutes_to_autostop: int; if provided, the cluster will be set to
       autostop after this many minutes of idleness.
     no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -217,7 +214,6 @@ def _execute(
         stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
-        detach_run=detach_run,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
         skip_unnecessary_provisioning=skip_unnecessary_provisioning,
@@ -239,7 +235,6 @@ def _execute_dag(
     stages: Optional[List[Stage]],
     cluster_name: Optional[str],
     detach_setup: bool,
-    detach_run: bool,
     no_setup: bool,
     clone_disk_from: Optional[str],
     skip_unnecessary_provisioning: bool,
@@ -507,10 +502,7 @@ def _execute_dag(
     if Stage.EXEC in stages:
         try:
             global_user_state.update_last_use(handle.get_cluster_name())
-            job_id = backend.execute(handle,
-                                     task,
-                                     detach_run,
-                                     dryrun=dryrun)
+            job_id = backend.execute(handle, task, dryrun=dryrun)
         finally:
             # Enables post_execute() to be run after KeyboardInterrupt.
             backend.post_execute(handle, down)
@@ -707,7 +699,6 @@ def launch(
         stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
-        detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
@@ -802,6 +793,5 @@ def exec(  # pylint: disable=redefined-builtin
             Stage.EXEC,
         ],
         cluster_name=cluster_name,
-        detach_run=True,
         job_logger=job_logger,
     )
sky/global_user_state.py CHANGED
@@ -2495,11 +2495,22 @@ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
     # on the local file system and migrate it to the database.
     # TODO(syang): remove this check once we have a way to migrate the
     # cluster from file to database. Remove on v0.12.0.
-    if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
-        with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-            yaml_str = f.read()
-        set_cluster_yaml(cluster_name, yaml_str)
-        return yaml_str
+    if cluster_yaml_path is not None:
+        # First try the exact path
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try with .debug suffix (when debug logging was enabled)
+        # Debug logging causes YAML files to be saved with .debug suffix
+        # but the path stored in the handle doesn't include it
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
     return None

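Note that the new lookup prefers the `.debug` copy whenever it exists, even if the plain path is also present. A minimal standalone sketch of the resolution order (resolve_cluster_yaml_path is a hypothetical name for illustration, not part of the module):

    import os
    from typing import Optional

    def resolve_cluster_yaml_path(
            cluster_yaml_path: Optional[str]) -> Optional[str]:
        """Mirror the fallback above: exact path first, then '.debug'."""
        if cluster_yaml_path is None:
            return None
        path_to_read = None
        if os.path.exists(cluster_yaml_path):
            path_to_read = cluster_yaml_path
        # Debug logging saves the YAML with a '.debug' suffix, but the handle
        # stores the path without it, so the suffixed copy wins when present.
        debug_path = cluster_yaml_path + '.debug'
        if os.path.exists(debug_path):
            path_to_read = debug_path
        return path_to_read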
sky/jobs/constants.py CHANGED
@@ -15,16 +15,10 @@ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
 CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
 SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Resources as a dict for the jobs controller.
-# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
-# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
-# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
-# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
-# parallelism limit, and memory / 350MB is the limit to concurrently running
-# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
 CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'cpus': '4+',
-    'memory': '8x',
+    'memory': '4x',
     'disk_size': 50
 }

sky/jobs/controller.py CHANGED
@@ -1144,7 +1144,15 @@ class Controller:
             await asyncio.sleep(30)
             continue

-        if len(running_tasks) >= scheduler.JOBS_PER_WORKER:
+        # Normally, 200 jobs can run on each controller. But if we have a
+        # ton of controllers, we need to limit the number of jobs that can
+        # run on each controller, to achieve a total of 2000 jobs across all
+        # controllers.
+        max_jobs = min(scheduler.MAX_JOBS_PER_WORKER,
+                       (scheduler.MAX_TOTAL_RUNNING_JOBS //
+                        scheduler.get_number_of_controllers()))
+
+        if len(running_tasks) >= max_jobs:
             await asyncio.sleep(60)
             continue

sky/jobs/scheduler.py CHANGED
@@ -63,7 +63,9 @@ from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.server import config as server_config
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils

 if typing.TYPE_CHECKING:
@@ -91,20 +93,29 @@ JOB_MEMORY_MB = 400
 LAUNCHES_PER_WORKER = 8
 # this can probably be increased to around 300-400 but keeping it lower to just
 # to be safe
-JOBS_PER_WORKER = 200
-
-# keep 1GB reserved after the controllers
-MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
-
-CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
-
+MAX_JOBS_PER_WORKER = 200
+# Maximum number of controllers that can be running. Hard to handle more than
+# 512 launches at once.
+MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
+# Limit the number of jobs that can be running at once on the entire jobs
+# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
+# once.
+# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
+# hardcoded max limit.
+MAX_TOTAL_RUNNING_JOBS = 2000
 # Maximum values for above constants. There will start to be lagging issues
 # at these numbers already.
 # JOB_MEMORY_MB = 200
 # LAUNCHES_PER_WORKER = 16
 # JOBS_PER_WORKER = 400

+# keep 2GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+

+@annotations.lru_cache(scope='global')
 def get_number_of_controllers() -> int:
     """Returns the number of controllers that should be running.

@@ -123,7 +134,7 @@ def get_number_of_controllers() -> int:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)

-    total_memory_mb = common_utils.get_mem_size_gb() * 1024
+    total_memory_mb = controller_utils.get_controller_mem_size_gb() * 1024
     if consolidation_mode:
         config = server_config.compute_server_config(deploy=True, quiet=True)

@@ -136,13 +147,16 @@ def get_number_of_controllers() -> int:
                 config.short_worker_config.burstable_parallelism) * \
                server_config.SHORT_WORKER_MEM_GB * 1024

-        return max(1, int((total_memory_mb - used) // JOB_MEMORY_MB))
+        return min(MAX_CONTROLLERS,
+                   max(1, int((total_memory_mb - used) // JOB_MEMORY_MB)))
     else:
-        return max(
-            1,
-            int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
-                ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) * 1024
-                 + JOB_MEMORY_MB)))
+        return min(
+            MAX_CONTROLLERS,
+            max(
+                1,
+                int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                    ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) *
+                     1024 + JOB_MEMORY_MB))))


 def start_controller() -> None:
@@ -280,7 +294,8 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                       common_utils.get_user_hash(), priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
-        run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+        run = (f'source {env_file_path} && '
+               f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
                f'--job-id {job_id} --env-file {env_file_path} '
                f'--user-yaml-path {original_user_yaml_path} '
                f'--priority {priority}')
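In the non-consolidation branch, the controller count is now clamped to the range [1, MAX_CONTROLLERS]. A simplified sketch of that formula; the LONG_WORKER_MEM_GB value below is assumed for illustration only (the real one lives in sky/server/config.py):

    JOB_MEMORY_MB = 400
    LAUNCHES_PER_WORKER = 8
    MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER  # 64
    MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
    LONG_WORKER_MEM_GB = 0.5  # placeholder, not the real constant

    def num_controllers(total_memory_mb: float) -> int:
        # Per-controller footprint, matching the denominator in the diff:
        # its launch workers' memory plus one job's memory.
        per_controller_mb = (LAUNCHES_PER_WORKER * LONG_WORKER_MEM_GB) * 1024 \
            + JOB_MEMORY_MB
        available_mb = total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
        return min(MAX_CONTROLLERS,
                   max(1, int(available_mb / per_controller_mb)))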
sky/jobs/server/core.py CHANGED
@@ -407,9 +407,12 @@ def launch(
     job_identity = ''
     if job_rank is not None:
         job_identity = f' (rank: {job_rank})'
-    logger.info(f'{colorama.Fore.YELLOW}'
-                f'Launching managed job {dag.name!r}{job_identity} '
-                f'from jobs controller...{colorama.Style.RESET_ALL}')
+    job_controller_postfix = (' from jobs controller' if
+                              consolidation_mode_job_id is None else '')
+    logger.info(
+        f'{colorama.Fore.YELLOW}'
+        f'Launching managed job {dag.name!r}{job_identity}'
+        f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')

     # Launch with the api server's user hash, so that sky status does
     # not show the owner of the controller as whatever user launched
@@ -456,6 +459,8 @@ def launch(
             managed_job_state.set_ha_recovery_script(
                 consolidation_mode_job_id, run_script)
         backend.run_on_head(local_handle, run_script)
+        ux_utils.starting_message(
+            f'Job submitted, ID: {consolidation_mode_job_id}')
         return consolidation_mode_job_id, local_handle

     if pool is None:
sky/jobs/utils.py CHANGED
@@ -11,6 +11,7 @@ import enum
 import logging
 import os
 import pathlib
+import re
 import shlex
 import textwrap
 import time
@@ -299,8 +300,10 @@ async def get_job_status(
             job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except (exceptions.CommandError, grpc.RpcError,
-                grpc.FutureTimeoutError) as e:
+        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+                ValueError, TypeError) as e:
+            # Note: Each of these exceptions has some additional conditions to
+            # limit how we handle it and whether or not we catch it.
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
            is_transient_error = False
@@ -319,6 +322,31 @@ async def get_job_status(
                 is_transient_error = True
             elif isinstance(e, grpc.FutureTimeoutError):
                 detailed_reason = 'Timeout'
+            # TODO(cooperc): Gracefully handle these exceptions in the backend.
+            elif isinstance(e, ValueError):
+                # If the cluster yaml is deleted in the middle of getting the
+                # SSH credentials, we could see this. See
+                # sky/global_user_state.py get_cluster_yaml_dict.
+                if re.search(r'Cluster yaml .* not found', str(e)):
+                    detailed_reason = 'Cluster yaml was deleted'
+                else:
+                    raise
+            elif isinstance(e, TypeError):
+                # We will grab the SSH credentials from the cluster yaml, but if
+                # handle.cluster_yaml is None, we will just return an empty dict
+                # for the credentials. See
+                # backend_utils.ssh_credential_from_yaml. Then, the credentials
+                # are passed as kwargs to SSHCommandRunner.__init__ - see
+                # cloud_vm_ray_backend.get_command_runners. So we can hit this
+                # TypeError if the cluster yaml is removed from the handle right
+                # when we pull it before the cluster is fully deleted.
+                error_msg_to_check = (
+                    'SSHCommandRunner.__init__() missing 2 required positional '
+                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
+                if str(e) == error_msg_to_check:
+                    detailed_reason = 'SSH credentials were already cleaned up'
+                else:
+                    raise
             if is_transient_error:
                 logger.info('Failed to connect to the cluster. Retrying '
                             f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
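Both new branches key off exact error text, which is why the TODO asks for graceful handling in the backend instead. A condensed sketch of the classification logic, reusing the messages from the diff (the function name is illustrative):

    import re
    from typing import Optional

    def classify_teardown_race(e: Exception) -> Optional[str]:
        """Return a detailed_reason if e matches a known teardown race."""
        if isinstance(e, ValueError):
            if re.search(r'Cluster yaml .* not found', str(e)):
                return 'Cluster yaml was deleted'
        elif isinstance(e, TypeError):
            expected = ('SSHCommandRunner.__init__() missing 2 required '
                        "positional arguments: 'ssh_user' and 'ssh_private_key'")
            if str(e) == expected:
                return 'SSH credentials were already cleaned up'
        return None  # not recognized; the caller should re-raise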
sky/metrics/utils.py CHANGED
@@ -11,7 +11,9 @@ from typing import List, Optional, Tuple
 import httpx
 import prometheus_client as prom

+from sky import sky_logging
 from sky.skylet import constants
+from sky.utils import common_utils
 from sky.utils import context_utils

 _SELECT_TIMEOUT = 1
@@ -35,6 +37,8 @@ _MEM_BUCKETS = [
     float('inf'),
 ]

+logger = sky_logging.init_logger(__name__)
+
 # Whether the metrics are enabled, cannot be changed at runtime.
 METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                  'false').lower() == 'true'
@@ -188,53 +192,61 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     if 'KUBECONFIG' not in env:
         env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')

-    # start the port forward process
-    port_forward_process = subprocess.Popen(cmd,
-                                            stdout=subprocess.PIPE,
-                                            stderr=subprocess.STDOUT,
-                                            text=True,
-                                            env=env)
-
+    port_forward_process = None
+    port_forward_exit = False
     local_port = None
-    start_time = time.time()
-
-    buffer = ''
-    # wait for the port forward to start and extract the local port
-    while time.time() - start_time < start_port_forward_timeout:
-        if port_forward_process.poll() is not None:
-            # port forward process has terminated
-            if port_forward_process.returncode != 0:
-                raise RuntimeError(
-                    f'Port forward failed for service {service} in namespace '
-                    f'{namespace} on context {context}')
-            break
-
-        # read output line by line to find the local port
-        if port_forward_process.stdout:
-            # Wait up to 1s for data to be available without blocking
-            r, _, _ = select.select([port_forward_process.stdout], [], [],
-                                    _SELECT_TIMEOUT)
-            if r:
-                # Read available bytes from the FD without blocking
-                fd = port_forward_process.stdout.fileno()
-                raw = os.read(fd, _SELECT_BUFFER_SIZE)
-                chunk = raw.decode(errors='ignore')
-                buffer += chunk
-                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
-                if match:
-                    local_port = int(match.group(1))
-                    break
-
-        # sleep for 100ms to avoid busy-waiting
-        time.sleep(0.1)

+    try:
+        # start the port forward process
+        port_forward_process = subprocess.Popen(cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=subprocess.STDOUT,
+                                                text=True,
+                                                env=env)
+
+        start_time = time.time()
+
+        buffer = ''
+        # wait for the port forward to start and extract the local port
+        while time.time() - start_time < start_port_forward_timeout:
+            if port_forward_process.poll() is not None:
+                # port forward process has terminated
+                if port_forward_process.returncode != 0:
+                    port_forward_exit = True
+                break
+
+            # read output line by line to find the local port
+            if port_forward_process.stdout:
+                # Wait up to 1s for data to be available without blocking
+                r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                        _SELECT_TIMEOUT)
+                if r:
+                    # Read available bytes from the FD without blocking
+                    fd = port_forward_process.stdout.fileno()
+                    raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                    chunk = raw.decode(errors='ignore')
+                    buffer += chunk
+                    match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)',
+                                      buffer)
+                    if match:
+                        local_port = int(match.group(1))
+                        break
+
+            # sleep for 100ms to avoid busy-waiting
+            time.sleep(0.1)
+    except BaseException:  # pylint: disable=broad-exception-caught
+        if port_forward_process:
+            stop_svc_port_forward(port_forward_process,
+                                  timeout=terminate_port_forward_timeout)
+        raise
+    if port_forward_exit:
+        raise RuntimeError(f'Port forward failed for service {service} in '
+                           f'namespace {namespace} on context {context}')
     if local_port is None:
         try:
-            port_forward_process.terminate()
-            port_forward_process.wait(timeout=terminate_port_forward_timeout)
-        except subprocess.TimeoutExpired:
-            port_forward_process.kill()
-            port_forward_process.wait()
+            if port_forward_process:
+                stop_svc_port_forward(port_forward_process,
+                                      timeout=terminate_port_forward_timeout)
         finally:
             raise RuntimeError(
                 f'Failed to extract local port for service {service} in '
@@ -243,14 +255,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     return port_forward_process, local_port


-def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
+def stop_svc_port_forward(port_forward_process: subprocess.Popen,
+                          timeout: int = 5) -> None:
     """Stops a port forward to a service in a Kubernetes cluster.

     Args:
         port_forward_process: The subprocess.Popen process to terminate
     """
     try:
         port_forward_process.terminate()
-        port_forward_process.wait(timeout=5)
+        port_forward_process.wait(timeout=timeout)
     except subprocess.TimeoutExpired:
         port_forward_process.kill()
         port_forward_process.wait()
@@ -301,6 +314,10 @@ async def send_metrics_request_with_port_forward(
         response.raise_for_status()
         return response.text

+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to send metrics request with port forward: '
+                     f'{common_utils.format_exception(e)}')
+        raise
     finally:
         # Always clean up port forward
         if port_forward_process:
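The refactor wraps the same non-blocking read pattern in try/except so the child process is cleaned up on any failure. A self-contained sketch of that pattern (the function name and constants here are illustrative, not the module's API):

    import os
    import re
    import select
    import subprocess
    import time

    def run_until_output_matches(cmd, pattern, timeout=10.0):
        """Spawn cmd and scan stdout for a regex without blocking on readline."""
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        buffer, deadline = '', time.time() + timeout
        try:
            while time.time() < deadline:
                if proc.poll() is not None:
                    break
                # select() says whether the pipe has data; os.read() then takes
                # whatever is available instead of waiting for a full line.
                readable, _, _ = select.select([proc.stdout], [], [], 1)
                if readable:
                    buffer += os.read(proc.stdout.fileno(),
                                      4096).decode(errors='ignore')
                    match = re.search(pattern, buffer)
                    if match:
                        return proc, match
                time.sleep(0.1)
        except BaseException:
            proc.terminate()  # mirror the diff: clean up before re-raising
            raise
        proc.terminate()
        return proc, None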
sky/provision/instance_setup.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from sky import exceptions
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky.provision import common
 from sky.provision import docker_utils
@@ -92,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
             f'{usage_constants.USAGE_RUN_ID_FILE}')


-def _set_skypilot_env_var_cmd() -> str:
-    """Sets the skypilot environment variables on the remote machine."""
-    env_vars = env_options.Options.all_options()
-    return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
-
-
 def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
     """Decorator that retries the function if it fails.

@@ -482,11 +477,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 @common.log_function_start_end
 @_auto_retry()
 @timeline.event
-def start_skylet_on_head_node(cluster_name: str,
-                              cluster_info: common.ClusterInfo,
-                              ssh_credentials: Dict[str, Any]) -> None:
+def start_skylet_on_head_node(
+        cluster_name: resources_utils.ClusterName,
+        cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
+        launched_resources: resources_lib.Resources) -> None:
     """Start skylet on the head node."""
-    del cluster_name
+    # Avoid circular import.
+    # pylint: disable=import-outside-toplevel
+    from sky.utils import controller_utils
+
+    def _set_skypilot_env_var_cmd() -> str:
+        """Sets the skypilot environment variables on the remote machine."""
+        env_vars = {
+            k: str(v) for (k, v) in env_options.Options.all_options().items()
+        }
+        is_controller = controller_utils.Controllers.from_name(
+            cluster_name.display_name) is not None
+        is_kubernetes = cluster_info.provider_name == 'kubernetes'
+        if is_controller and is_kubernetes:
+            # For jobs/serve controller, we pass in the CPU and memory limits
+            # when starting the skylet to handle cases where these env vars
+            # are not set on the cluster's pod spec. The skylet will read
+            # these env vars when starting (ManagedJobEvent.start()) and write
+            # it to disk.
+            resources = launched_resources.assert_launchable()
+            vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
+                resources.instance_type)
+            if vcpus is not None:
+                env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
+            if mem is not None:
+                env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
+        return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
+
     runners = provision.get_command_runners(cluster_info.provider_name,
                                             cluster_info, **ssh_credentials)
     head_runner = runners[0]
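The inner helper still reduces everything to a single string of exports that can be prepended to the remote command; in miniature:

    def export_cmd(env_vars: dict) -> str:
        # {'A': '1', 'B': '2'} -> 'export A=1; export B=2'
        return '; '.join(f'export {k}={v}' for k, v in env_vars.items())

    assert export_cmd({'SKYPILOT_POD_CPU_CORE_LIMIT': '4',
                       'SKYPILOT_POD_MEMORY_GB_LIMIT': '16'}) == \
        ('export SKYPILOT_POD_CPU_CORE_LIMIT=4; '
         'export SKYPILOT_POD_MEMORY_GB_LIMIT=16')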
sky/provision/kubernetes/utils.py CHANGED
@@ -1688,7 +1688,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is "free" because this function is a cached call,
+        # and it will not be called again in this function.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         # when docs are ready.
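Passing limit=1 keeps the credential probe cheap on large namespaces: the API server returns at most one pod rather than the full list, while any response still proves the credentials can list pods. Roughly, with the official kubernetes Python client:

    from kubernetes import client, config

    config.load_kube_config()  # or load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    # A permissions/connectivity probe: even an empty item list is a success.
    v1.list_namespaced_pod('default', limit=1, _request_timeout=5)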
sky/provision/provisioner.py CHANGED
@@ -18,6 +18,7 @@ from sky import exceptions
 from sky import global_user_state
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
@@ -428,13 +429,14 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,


 def _post_provision_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
         provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str]) -> provision_common.ClusterInfo:
     config_from_yaml = global_user_state.get_cluster_yaml_dict(
         handle_cluster_yaml)
     provider_config = config_from_yaml.get('provider')
+    cloud_name = repr(launched_resources.cloud)
     cluster_info = provision.get_cluster_info(cloud_name,
                                               provision_record.region,
                                               cluster_name.name_on_cloud,
@@ -694,8 +696,9 @@ def _post_provision_setup(
                                              cluster_info,
                                              ssh_credentials)

-    instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
-                                             cluster_info, ssh_credentials)
+    instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
+                                             ssh_credentials,
+                                             launched_resources)

     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
@@ -706,8 +709,8 @@ def _post_provision_setup(

 @timeline.event
 def post_provision_runtime_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
         provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str],
         log_dir: str) -> provision_common.ClusterInfo:
@@ -728,7 +731,7 @@ def post_provision_runtime_setup(
     try:
         logger.debug(_TITLE.format('System Setup After Provision'))
         return _post_provision_setup(
-            cloud_name,
+            launched_resources,
             cluster_name,
             handle_cluster_yaml=handle_cluster_yaml,
             provision_record=provision_record,
sky/schemas/api/responses.py CHANGED
@@ -206,9 +206,9 @@ class VolumeRecord(ResponseBaseModel):
     type: str
     launched_at: int
     cloud: str
-    region: str
+    region: Optional[str] = None
     zone: Optional[str] = None
-    size: str
+    size: Optional[str] = None
     config: Dict[str, Any]
     name_on_cloud: str
     user_hash: str
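Loosening region and size to Optional with defaults means a payload that omits them still validates instead of raising. A quick illustration with pydantic v2 (a trimmed stand-in model, not the full schema):

    from typing import Optional
    import pydantic

    class VolumeRecord(pydantic.BaseModel):  # stand-in for the real model
        name: str
        region: Optional[str] = None
        size: Optional[str] = None

    # Previously this would raise ValidationError for the missing fields;
    # now it parses with None defaults.
    record = VolumeRecord.model_validate({'name': 'vol-1'})
    assert record.region is None and record.size is None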
sky/schemas/db/global_user_state/010_save_ssh_key.py ADDED
@@ -0,0 +1,66 @@
+"""Add ssh keys in filesystem to global user state.
+
+Revision ID: 010
+Revises: 009
+Create Date: 2025-10-07
+
+"""
+import glob
+# pylint: disable=invalid-name
+import os
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision: str = '010'
+down_revision: Union[str, Sequence[str], None] = '009'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Migrate ssh keys from the filesystem into the ssh_key table."""
+    connection = op.get_bind()
+
+    match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
+    file_user_hashes = set()
+    for match_dir in match_dirs:
+        user_hash = match_dir.split('/')[-2]
+        file_user_hashes.add(user_hash)
+
+    # Get all existing ssh keys
+    existing_user_hashes = set()
+    result = connection.execute(sa.text('SELECT user_hash FROM ssh_key'))
+    for row in result:
+        existing_user_hashes.add(row[0])
+
+    user_hashes_to_add = file_user_hashes - existing_user_hashes
+    for user_hash in user_hashes_to_add:
+        match_dir = os.path.join(os.path.expanduser('~/.sky/clients'),
+                                 user_hash, 'ssh')
+        public_key_path = os.path.join(match_dir, 'sky-key.pub')
+        private_key_path = os.path.join(match_dir, 'sky-key')
+        try:
+            with open(public_key_path, 'r', encoding='utf-8') as f:
+                public_key = f.read().strip()
+            with open(private_key_path, 'r', encoding='utf-8') as f:
+                private_key = f.read().strip()
+        except FileNotFoundError:
+            # Skip if the key files are not found
+            continue
+        connection.execute(
+            sa.text('INSERT INTO ssh_key '
+                    '(user_hash, ssh_public_key, ssh_private_key) '
+                    'VALUES (:user_hash, :ssh_public_key, :ssh_private_key) '
+                    'ON CONFLICT DO NOTHING'), {
+                        'user_hash': user_hash,
+                        'ssh_public_key': public_key,
+                        'ssh_private_key': private_key
+                    })
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
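After the migration runs, each per-user key pair found under ~/.sky/clients/<user_hash>/ssh is mirrored into the ssh_key table, with existing rows left untouched thanks to ON CONFLICT DO NOTHING. Assuming the default sqlite backing store at ~/.sky/state.db (server deployments may use a different database), a spot-check might look like:

    import os
    import sqlite3

    conn = sqlite3.connect(os.path.expanduser('~/.sky/state.db'))
    for (user_hash,) in conn.execute('SELECT user_hash FROM ssh_key'):
        print(user_hash)
    conn.close()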
sky/server/common.py CHANGED
@@ -950,6 +950,7 @@ def clear_local_api_server_database() -> None:
     db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
     for extension in ['', '-shm', '-wal']:
         try:
+            logger.debug(f'Removing database file {db_path}{extension}')
             os.remove(f'{db_path}{extension}')
         except FileNotFoundError:
             logger.debug(f'Database file {db_path}{extension} not found.')