skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (66)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +1 -1
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +11 -13
  6. sky/backends/cloud_vm_ray_backend.py +11 -22
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/client/cli/command.py +41 -9
  9. sky/client/sdk.py +23 -8
  10. sky/client/sdk_async.py +6 -2
  11. sky/core.py +1 -4
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +24 -12
  30. sky/jobs/client/sdk.py +5 -2
  31. sky/jobs/recovery_strategy.py +9 -4
  32. sky/logs/agent.py +2 -2
  33. sky/logs/aws.py +6 -3
  34. sky/provision/do/utils.py +2 -1
  35. sky/provision/kubernetes/instance.py +55 -11
  36. sky/provision/kubernetes/utils.py +2 -2
  37. sky/provision/nebius/utils.py +36 -2
  38. sky/serve/client/impl.py +5 -4
  39. sky/serve/replica_managers.py +4 -3
  40. sky/serve/serve_utils.py +2 -2
  41. sky/serve/server/impl.py +3 -2
  42. sky/server/auth/oauth2_proxy.py +10 -4
  43. sky/server/common.py +3 -2
  44. sky/server/daemons.py +10 -5
  45. sky/server/requests/executor.py +2 -1
  46. sky/server/requests/requests.py +21 -0
  47. sky/server/server.py +16 -0
  48. sky/skylet/events.py +2 -3
  49. sky/skypilot_config.py +10 -10
  50. sky/task.py +1 -1
  51. sky/templates/nebius-ray.yml.j2 +4 -8
  52. sky/usage/usage_lib.py +3 -2
  53. sky/utils/common_utils.py +0 -72
  54. sky/utils/controller_utils.py +4 -3
  55. sky/utils/dag_utils.py +4 -4
  56. sky/utils/kubernetes/config_map_utils.py +3 -3
  57. sky/utils/schemas.py +3 -0
  58. sky/utils/yaml_utils.py +77 -10
  59. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +1 -1
  60. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +66 -66
  61. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
  62. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
  63. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
  64. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
  65. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
  66. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Kubernetes instance provisioning."""
2
2
  import copy
3
+ import datetime
3
4
  import json
4
5
  import re
5
6
  import time
@@ -1254,9 +1255,11 @@ def get_cluster_info(
1254
1255
  provider_config=provider_config)
1255
1256
 
1256
1257
 
1257
- def _get_pod_termination_reason(pod: Any) -> str:
1258
+ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
1259
+ """Get pod termination reason and write to cluster events."""
1258
1260
  reasons = []
1259
- if pod.status.container_statuses:
1261
+ latest_timestamp = pod.status.start_time or datetime.datetime.min
1262
+ if pod.status and pod.status.container_statuses:
1260
1263
  for container_status in pod.status.container_statuses:
1261
1264
  terminated = container_status.state.terminated
1262
1265
  if terminated:
@@ -1264,20 +1267,38 @@ def _get_pod_termination_reason(pod: Any) -> str:
1264
1267
  reason = terminated.reason
1265
1268
  if exit_code == 0:
1266
1269
  # skip exit 0 (non-failed) just for sanity
1270
+ logger.debug(f'{pod.metadata.name}/{container_status.name} '
1271
+ 'had exit code 0. Skipping.')
1267
1272
  continue
1268
1273
  if reason is None:
1269
1274
  # just in-case reason is None, have default for debugging
1270
1275
  reason = f'exit({exit_code})'
1271
1276
  reasons.append(reason)
1277
+ if terminated.finished_at > latest_timestamp:
1278
+ latest_timestamp = terminated.finished_at
1279
+
1272
1280
  # TODO (kyuds): later, if needed, query `last_state` too.
1273
1281
 
1282
+ if not reasons:
1283
+ return ''
1284
+
1274
1285
  # Normally we will have a single container per pod for skypilot
1275
1286
  # but doing this just in-case there are multiple containers.
1276
- return ' | '.join(reasons)
1287
+ pod_reason = ' | '.join(reasons)
1288
+
1289
+ global_user_state.add_cluster_event(
1290
+ cluster_name,
1291
+ None,
1292
+ f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
1293
+ global_user_state.ClusterEventType.DEBUG,
1294
+ transitioned_at=int(latest_timestamp.timestamp()),
1295
+ )
1296
+ return pod_reason
1277
1297
 
1278
1298
 
1279
1299
  def _get_pod_missing_reason(context: Optional[str], namespace: str,
1280
1300
  cluster_name: str, pod_name: str) -> Optional[str]:
1301
+ """Get events for missing pod and write to cluster events."""
1281
1302
  logger.debug(f'Analyzing events for pod {pod_name}')
1282
1303
  pod_field_selector = (
1283
1304
  f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
@@ -1293,6 +1314,8 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1293
1314
  last_scheduled_node = None
1294
1315
  insert_new_pod_event = True
1295
1316
  new_event_inserted = False
1317
+ inserted_pod_events = 0
1318
+
1296
1319
  for event in pod_events:
1297
1320
  if event.reason == 'Scheduled':
1298
1321
  pattern = r'Successfully assigned (\S+) to (\S+)'
@@ -1313,10 +1336,18 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1313
1336
  transitioned_at=int(
1314
1337
  event.metadata.creation_timestamp.timestamp()),
1315
1338
  expose_duplicate_error=True)
1339
+ logger.debug(f'[pod {pod_name}] encountered new pod event: '
1340
+ f'{event.metadata.creation_timestamp} '
1341
+ f'{event.reason} {event.message}')
1316
1342
  except db_utils.UniqueConstraintViolationError:
1317
1343
  insert_new_pod_event = False
1318
1344
  else:
1319
1345
  new_event_inserted = True
1346
+ inserted_pod_events += 1
1347
+
1348
+ logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
1349
+ f'inserted {inserted_pod_events} new pod events '
1350
+ 'previously unseen')
1320
1351
 
1321
1352
  if last_scheduled_node is not None:
1322
1353
  node_field_selector = ('involvedObject.kind=Node,'
@@ -1331,6 +1362,7 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1331
1362
  # latest event appears first
1332
1363
  reverse=True)
1333
1364
  insert_new_node_event = True
1365
+ inserted_node_events = 0
1334
1366
  for event in node_events:
1335
1367
  if insert_new_node_event:
1336
1368
  # Try inserting the latest events first. If the event is a
@@ -1345,10 +1377,23 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1345
1377
  transitioned_at=int(
1346
1378
  event.metadata.creation_timestamp.timestamp()),
1347
1379
  expose_duplicate_error=True)
1380
+ logger.debug(
1381
+ f'[pod {pod_name}] encountered new node event: '
1382
+ f'{event.metadata.creation_timestamp} '
1383
+ f'{event.reason} {event.message}')
1348
1384
  except db_utils.UniqueConstraintViolationError:
1349
1385
  insert_new_node_event = False
1350
1386
  else:
1351
1387
  new_event_inserted = True
1388
+ inserted_node_events += 1
1389
+
1390
+ logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
1391
+ f'processed {len(node_events)} node events and '
1392
+ f'inserted {inserted_node_events} new node events '
1393
+ 'previously unseen')
1394
+ else:
1395
+ logger.debug(f'[pod {pod_name}] could not determine the node '
1396
+ 'the pod was scheduled to')
1352
1397
 
1353
1398
  if not new_event_inserted:
1354
1399
  # If new event is not inserted, there is no useful information to
@@ -1390,13 +1435,15 @@ def query_instances(
1390
1435
  provider_config: Optional[Dict[str, Any]] = None,
1391
1436
  non_terminated_only: bool = True
1392
1437
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
1438
+ # Mapping from pod phase to skypilot status. These are the only valid pod
1439
+ # phases.
1440
+ # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
1393
1441
  status_map = {
1394
1442
  'Pending': status_lib.ClusterStatus.INIT,
1395
1443
  'Running': status_lib.ClusterStatus.UP,
1396
1444
  'Failed': status_lib.ClusterStatus.INIT,
1397
1445
  'Unknown': None,
1398
1446
  'Succeeded': None,
1399
- 'Terminating': None,
1400
1447
  }
1401
1448
 
1402
1449
  assert provider_config is not None
@@ -1440,18 +1487,15 @@ def query_instances(
1440
1487
  for pod in pods:
1441
1488
  phase = pod.status.phase
1442
1489
  pod_status = status_map[phase]
1490
+ reason = None
1491
+ if phase in ('Failed', 'Unknown'):
1492
+ reason = _get_pod_termination_reason(pod, cluster_name)
1493
+ logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
1443
1494
  if non_terminated_only and pod_status is None:
1444
1495
  logger.debug(f'Pod {pod.metadata.name} is terminated, but '
1445
1496
  'query_instances is called with '
1446
1497
  f'non_terminated_only=True. Phase: {phase}')
1447
- if phase == 'Failed':
1448
- reason_for_debug = _get_pod_termination_reason(pod)
1449
- logger.debug(f'Termination reason: {reason_for_debug}')
1450
1498
  continue
1451
- reason = None
1452
- if phase == 'Failed':
1453
- reason = _get_pod_termination_reason(pod)
1454
- logger.debug(f'Pod Status Reason(s): {reason}')
1455
1499
  pod_name = pod.metadata.name
1456
1500
  reason = f'{pod_name}: {reason}' if reason is not None else None
1457
1501
  cluster_status[pod_name] = (pod_status, reason)
sky/provision/kubernetes/utils.py CHANGED
@@ -2782,7 +2782,7 @@ def combine_pod_config_fields(
2782
2782
  kubernetes_config)
2783
2783
 
2784
2784
  # Write the updated YAML back to the file
2785
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2785
+ yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2786
2786
 
2787
2787
 
2788
2788
  def combine_metadata_fields(cluster_yaml_path: str,
@@ -2834,7 +2834,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
2834
2834
  config_utils.merge_k8s_configs(destination, custom_metadata)
2835
2835
 
2836
2836
  # Write the updated YAML back to the file
2837
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2837
+ yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2838
2838
 
2839
2839
 
2840
2840
  def merge_custom_metadata(
sky/provision/nebius/utils.py CHANGED
@@ -14,6 +14,8 @@ logger = sky_logging.init_logger(__name__)
14
14
 
15
15
  POLL_INTERVAL = 5
16
16
 
17
+ _MAX_OPERATIONS_TO_FETCH = 1000
18
+
17
19
 
18
20
  def retry(func):
19
21
  """Decorator to retry a function."""
@@ -321,11 +323,43 @@ def launch(cluster_name_on_cloud: str,
321
323
  parent_id=project_id,
322
324
  name=instance_name,
323
325
  )))
326
+ instance_id = instance.metadata.id
324
327
  if instance.status.state.name == 'STARTING':
325
- instance_id = instance.metadata.id
326
328
  break
329
+
330
+ # All Instances initially have state=STOPPED and reconciling=True,
331
+ # so we need to wait until reconciling is False.
332
+ if instance.status.state.name == 'STOPPED' and \
333
+ not instance.status.reconciling:
334
+ next_token = ''
335
+ total_operations = 0
336
+ while True:
337
+ operations_response = nebius.sync_call(
338
+ service.list_operations_by_parent(
339
+ nebius.compute().ListOperationsByParentRequest(
340
+ parent_id=project_id,
341
+ page_size=100,
342
+ page_token=next_token,
343
+ )))
344
+ total_operations += len(operations_response.operations)
345
+ for operation in operations_response.operations:
346
+ # Find the most recent operation for the instance.
347
+ if operation.resource_id == instance_id:
348
+ error_msg = operation.description
349
+ if operation.status:
350
+ error_msg += f' {operation.status.message}'
351
+ raise RuntimeError(error_msg)
352
+ # If we've fetched too many operations, or there are no more
353
+ # operations to fetch, just raise a generic error.
354
+ if total_operations > _MAX_OPERATIONS_TO_FETCH or \
355
+ not operations_response.next_page_token:
356
+ raise RuntimeError(
357
+ f'Instance {instance_name} failed to start.')
358
+ next_token = operations_response.next_page_token
327
359
  time.sleep(POLL_INTERVAL)
328
- logger.debug(f'Waiting for instance {instance_name} start running.')
360
+ logger.debug(f'Waiting for instance {instance_name} to start running. '
361
+ f'State: {instance.status.state.name}, '
362
+ f'Reconciling: {instance.status.reconciling}')
329
363
  retry_count += 1
330
364
 
331
365
  if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
sky/serve/client/impl.py CHANGED
@@ -224,10 +224,11 @@ def tail_logs(service_name: str,
224
224
  stream=True)
225
225
  request_id: server_common.RequestId[None] = server_common.get_request_id(
226
226
  response)
227
- return sdk.stream_response(request_id=request_id,
228
- response=response,
229
- output_stream=output_stream,
230
- resumable=True)
227
+ sdk.stream_response(request_id=request_id,
228
+ response=response,
229
+ output_stream=output_stream,
230
+ resumable=True,
231
+ get_result=follow)
231
232
 
232
233
 
233
234
  def sync_down_logs(service_name: str,
sky/serve/replica_managers.py CHANGED
@@ -37,6 +37,7 @@ from sky.utils import env_options
37
37
  from sky.utils import resources_utils
38
38
  from sky.utils import status_lib
39
39
  from sky.utils import ux_utils
40
+ from sky.utils import yaml_utils
40
41
 
41
42
  if typing.TYPE_CHECKING:
42
43
  from sky.serve import service_spec
@@ -79,7 +80,7 @@ def launch_cluster(replica_id: int,
79
80
  f'{cluster_name} with resources override: '
80
81
  f'{resources_override}')
81
82
  try:
82
- config = common_utils.read_yaml(
83
+ config = yaml_utils.read_yaml(
83
84
  os.path.expanduser(service_task_yaml_path))
84
85
  task = task_lib.Task.from_yaml_config(config)
85
86
  if resources_override is not None:
@@ -1397,7 +1398,7 @@ class SkyPilotReplicaManager(ReplicaManager):
1397
1398
  # the latest version. This can significantly improve the speed
1398
1399
  # for updating an existing service with only config changes to the
1399
1400
  # service specs, e.g. scale down the service.
1400
- new_config = common_utils.read_yaml(
1401
+ new_config = yaml_utils.read_yaml(
1401
1402
  os.path.expanduser(service_task_yaml_path))
1402
1403
  # Always create new replicas and scale down old ones when file_mounts
1403
1404
  # are not empty.
@@ -1414,7 +1415,7 @@ class SkyPilotReplicaManager(ReplicaManager):
1414
1415
  old_service_task_yaml_path = (
1415
1416
  serve_utils.generate_task_yaml_file_name(
1416
1417
  self._service_name, info.version))
1417
- old_config = common_utils.read_yaml(
1418
+ old_config = yaml_utils.read_yaml(
1418
1419
  os.path.expanduser(old_service_task_yaml_path))
1419
1420
  for key in ['service', 'pool', '_user_specified_yaml']:
1420
1421
  old_config.pop(key, None)
sky/serve/serve_utils.py CHANGED
@@ -699,7 +699,7 @@ def _get_service_status(
699
699
  if record['pool']:
700
700
  latest_yaml_path = generate_task_yaml_file_name(service_name,
701
701
  record['version'])
702
- raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
702
+ raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
703
703
  original_config = raw_yaml_config.get('_user_specified_yaml')
704
704
  if original_config is None:
705
705
  # Fall back to old display format.
@@ -711,7 +711,7 @@ def _get_service_status(
711
711
  original_config['pool'] = svc # Add pool to root config
712
712
  else:
713
713
  original_config = yaml_utils.safe_load(original_config)
714
- record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
714
+ record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
715
715
 
716
716
  record['target_num_replicas'] = 0
717
717
  try:
sky/serve/server/impl.py CHANGED
@@ -34,6 +34,7 @@ from sky.utils import dag_utils
34
34
  from sky.utils import rich_utils
35
35
  from sky.utils import subprocess_utils
36
36
  from sky.utils import ux_utils
37
+ from sky.utils import yaml_utils
37
38
 
38
39
  logger = sky_logging.init_logger(__name__)
39
40
 
@@ -179,7 +180,7 @@ def up(
179
180
  controller = controller_utils.get_controller_for_pool(pool)
180
181
  controller_name = controller.value.cluster_name
181
182
  task_config = task.to_yaml_config()
182
- common_utils.dump_yaml(service_file.name, task_config)
183
+ yaml_utils.dump_yaml(service_file.name, task_config)
183
184
  remote_tmp_task_yaml_path = (
184
185
  serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
185
186
  remote_config_yaml_path = (
@@ -531,7 +532,7 @@ def update(
531
532
  prefix=f'{service_name}-v{current_version}',
532
533
  mode='w') as service_file:
533
534
  task_config = task.to_yaml_config()
534
- common_utils.dump_yaml(service_file.name, task_config)
535
+ yaml_utils.dump_yaml(service_file.name, task_config)
535
536
  remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
536
537
  service_name, current_version, expand_user=False)
537
538
 
sky/server/auth/oauth2_proxy.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
4
4
  import hashlib
5
5
  import http
6
6
  import os
7
+ import traceback
7
8
  from typing import Optional
8
9
  import urllib
9
10
 
@@ -109,8 +110,8 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
109
110
  try:
110
111
  return await self._authenticate(request, call_next, session)
111
112
  except (aiohttp.ClientError, asyncio.TimeoutError) as e:
112
- logger.error(f'Error communicating with OAuth2 proxy: {e}')
113
- # Fail open or closed based on your security requirements
113
+ logger.error(f'Error communicating with OAuth2 proxy: {e}'
114
+ f'{traceback.format_exc()}')
114
115
  return fastapi.responses.JSONResponse(
115
116
  status_code=http.HTTPStatus.BAD_GATEWAY,
116
117
  content={'detail': 'oauth2-proxy service unavailable'})
@@ -120,10 +121,15 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
120
121
  forwarded_headers = dict(request.headers)
121
122
  auth_url = f'{self.proxy_base}/oauth2/auth'
122
123
  forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
123
- logger.debug(f'authenticate request: {request.url.path}')
124
+ # Remove content-length and content-type headers and drop request body
125
+ # to reduce the auth overhead.
126
+ forwarded_headers.pop('content-length', None)
127
+ forwarded_headers.pop('content-type', None)
128
+ logger.debug(f'authenticate request: {auth_url}, '
129
+ f'headers: {forwarded_headers}')
124
130
 
125
131
  async with session.request(
126
- method=request.method,
132
+ method='GET',
127
133
  url=auth_url,
128
134
  headers=forwarded_headers,
129
135
  cookies=request.cookies,
sky/server/common.py CHANGED
@@ -41,6 +41,7 @@ from sky.utils import annotations
41
41
  from sky.utils import common_utils
42
42
  from sky.utils import rich_utils
43
43
  from sky.utils import ux_utils
44
+ from sky.utils import yaml_utils
44
45
 
45
46
  if typing.TYPE_CHECKING:
46
47
  import aiohttp
@@ -816,7 +817,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
816
817
  return str(client_file_mounts_dir /
817
818
  file_mounts_mapping[original_path].lstrip('/'))
818
819
 
819
- task_configs = common_utils.read_yaml_all(str(client_task_path))
820
+ task_configs = yaml_utils.read_yaml_all(str(client_task_path))
820
821
  for task_config in task_configs:
821
822
  if task_config is None:
822
823
  continue
@@ -869,7 +870,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
869
870
  # We can switch to using string, but this is to make it easier to debug, by
870
871
  # persisting the translated task yaml file.
871
872
  translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
872
- common_utils.dump_yaml(str(translated_client_task_path), task_configs)
873
+ yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
873
874
 
874
875
  dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
875
876
  return dag
sky/server/daemons.py CHANGED
@@ -191,23 +191,28 @@ INTERNAL_REQUEST_DAEMONS = [
191
191
  # set to updated status automatically, without showing users the hint of
192
192
  # cluster being stopped or down when `sky status -r` is called.
193
193
  InternalRequestDaemon(id='skypilot-status-refresh-daemon',
194
- name='status',
194
+ name='status-refresh',
195
195
  event_fn=refresh_cluster_status_event,
196
196
  default_log_level='DEBUG'),
197
197
  # Volume status refresh daemon to update the volume status periodically.
198
198
  InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
199
- name='volume',
199
+ name='volume-refresh',
200
200
  event_fn=refresh_volume_status_event),
201
201
  InternalRequestDaemon(id='managed-job-status-refresh-daemon',
202
- name='managed-job-status',
202
+ name='managed-job-status-refresh',
203
203
  event_fn=managed_job_status_refresh_event,
204
204
  should_skip=should_skip_managed_job_status_refresh),
205
205
  InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
206
- name='sky-serve-status',
206
+ name='sky-serve-status-refresh',
207
207
  event_fn=sky_serve_status_refresh_event,
208
208
  should_skip=should_skip_sky_serve_status_refresh),
209
209
  InternalRequestDaemon(id='pool-status-refresh-daemon',
210
- name='pool-status',
210
+ name='pool-status-refresh',
211
211
  event_fn=pool_status_refresh_event,
212
212
  should_skip=should_skip_pool_status_refresh),
213
213
  ]
214
+
215
+
216
+ def is_daemon_request_id(request_id: str) -> bool:
217
+ """Returns whether a specific request_id is an internal daemon."""
218
+ return any([d.id == request_id for d in INTERNAL_REQUEST_DAEMONS])
sky/server/requests/executor.py CHANGED
@@ -55,6 +55,7 @@ from sky.utils import context_utils
55
55
  from sky.utils import subprocess_utils
56
56
  from sky.utils import tempstore
57
57
  from sky.utils import timeline
58
+ from sky.utils import yaml_utils
58
59
  from sky.workspaces import core as workspaces_core
59
60
 
60
61
  if typing.TYPE_CHECKING:
@@ -387,7 +388,7 @@ def _request_execution_wrapper(request_id: str,
387
388
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
388
389
  config = skypilot_config.to_dict()
389
390
  logger.debug(f'request config: \n'
390
- f'{common_utils.dump_yaml_str(dict(config))}')
391
+ f'{yaml_utils.dump_yaml_str(dict(config))}')
391
392
  return_value = func(**request_body.to_kwargs())
392
393
  f.flush()
393
394
  except KeyboardInterrupt:
sky/server/requests/requests.py CHANGED
@@ -565,6 +565,27 @@ def get_request_tasks(
565
565
  return requests
566
566
 
567
567
 
568
+ @init_db
569
+ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
570
+ """Get a list of API request ids for shell completion."""
571
+ assert _DB is not None
572
+ with _DB.conn:
573
+ cursor = _DB.conn.cursor()
574
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
575
+ # then order by creation time (newest first) within each category.
576
+ cursor.execute(
577
+ f"""SELECT request_id FROM {REQUEST_TABLE}
578
+ WHERE request_id LIKE ?
579
+ ORDER BY
580
+ CASE
581
+ WHEN status IN ('PENDING', 'RUNNING') THEN 0
582
+ ELSE 1
583
+ END,
584
+ created_at DESC
585
+ LIMIT 1000""", (f'{incomplete}%',))
586
+ return [row[0] for row in cursor.fetchall()]
587
+
588
+
568
589
  def _add_or_update_request_no_lock(request: Request):
569
590
  """Add or update a REST request into the database."""
570
591
  row = request.to_row()
sky/server/server.py CHANGED
@@ -1403,6 +1403,9 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1403
1403
  raise fastapi.HTTPException(
1404
1404
  status_code=500, detail=request_task.encode().model_dump())
1405
1405
  return request_task.encode()
1406
+ elif (request_task.status == requests_lib.RequestStatus.RUNNING and
1407
+ daemons.is_daemon_request_id(request_id)):
1408
+ return request_task.encode()
1406
1409
  # yield control to allow other coroutines to run, sleep shortly
1407
1410
  # to avoid storming the DB and CPU in the meantime
1408
1411
  await asyncio.sleep(0.1)
@@ -1491,6 +1494,14 @@ async def stream(
1491
1494
  if log_path == constants.API_SERVER_LOGS:
1492
1495
  resolved_log_path = pathlib.Path(
1493
1496
  constants.API_SERVER_LOGS).expanduser()
1497
+ if not resolved_log_path.exists():
1498
+ raise fastapi.HTTPException(
1499
+ status_code=404,
1500
+ detail='Server log file does not exist. The API server may '
1501
+ 'have been started with `--foreground` - check the '
1502
+ 'stdout of API server process, such as: '
1503
+ '`kubectl logs -n api-server-namespace '
1504
+ 'api-server-pod-name`')
1494
1505
  else:
1495
1506
  # This should be a log path under ~/sky_logs.
1496
1507
  resolved_logs_directory = pathlib.Path(
@@ -1769,6 +1780,11 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
1769
1780
  return global_user_state.get_volume_names_start_with(incomplete)
1770
1781
 
1771
1782
 
1783
+ @app.get('/api/completion/api_request')
1784
+ async def complete_api_request(incomplete: str,) -> List[str]:
1785
+ return requests_lib.get_api_request_ids_start_with(incomplete)
1786
+
1787
+
1772
1788
  @app.get('/dashboard/{full_path:path}')
1773
1789
  async def serve_dashboard(full_path: str):
1774
1790
  """Serves the Next.js dashboard application.
sky/skylet/events.py CHANGED
@@ -20,7 +20,6 @@ from sky.skylet import constants
20
20
  from sky.skylet import job_lib
21
21
  from sky.usage import usage_lib
22
22
  from sky.utils import cluster_utils
23
- from sky.utils import common_utils
24
23
  from sky.utils import registry
25
24
  from sky.utils import ux_utils
26
25
  from sky.utils import yaml_utils
@@ -181,7 +180,7 @@ class AutostopEvent(SkyletEvent):
181
180
 
182
181
  config_path = os.path.abspath(
183
182
  os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
184
- config = common_utils.read_yaml(config_path)
183
+ config = yaml_utils.read_yaml(config_path)
185
184
  provider_name = cluster_utils.get_provider_name(config)
186
185
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
187
186
  assert cloud is not None, f'Unknown cloud: {provider_name}'
@@ -326,5 +325,5 @@ class AutostopEvent(SkyletEvent):
326
325
  config['auth'].pop('ssh_proxy_command', None)
327
326
  # Empty the file_mounts.
328
327
  config['file_mounts'] = {}
329
- common_utils.dump_yaml(yaml_path, config)
328
+ yaml_utils.dump_yaml(yaml_path, config)
330
329
  logger.debug('Replaced upscaling speed to 0.')
sky/skypilot_config.py CHANGED
@@ -494,7 +494,7 @@ def reload_config() -> None:
494
494
  def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
495
495
  config = config_utils.Config()
496
496
  try:
497
- config_dict = common_utils.read_yaml(config_path)
497
+ config_dict = yaml_utils.read_yaml(config_path)
498
498
  config = config_utils.Config.from_dict(config_dict)
499
499
  # pop the db url from the config, and set it to the env var.
500
500
  # this is to avoid db url (considered a sensitive value)
@@ -504,7 +504,7 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
504
504
  os.environ[constants.ENV_VAR_DB_CONNECTION_URI] = db_url
505
505
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
506
506
  logger.debug(f'Config loaded from {config_path}:\n'
507
- f'{common_utils.dump_yaml_str(dict(config))}')
507
+ f'{yaml_utils.dump_yaml_str(dict(config))}')
508
508
  except yaml.YAMLError as e:
509
509
  logger.error(f'Error in loading config file ({config_path}):', e)
510
510
  if config:
@@ -600,7 +600,7 @@ def _reload_config_as_server() -> None:
600
600
  sqlalchemy_engine.dispose()
601
601
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
602
602
  logger.debug(f'server config: \n'
603
- f'{common_utils.dump_yaml_str(dict(server_config))}')
603
+ f'{yaml_utils.dump_yaml_str(dict(server_config))}')
604
604
  _set_loaded_config(server_config)
605
605
  _set_loaded_config_path(server_config_path)
606
606
 
@@ -628,7 +628,7 @@ def _reload_config_as_client() -> None:
628
628
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
629
629
  logger.debug(
630
630
  f'client config (before task and CLI overrides): \n'
631
- f'{common_utils.dump_yaml_str(dict(overlaid_client_config))}')
631
+ f'{yaml_utils.dump_yaml_str(dict(overlaid_client_config))}')
632
632
  _set_loaded_config(overlaid_client_config)
633
633
  _set_loaded_config_path([user_config_path, project_config_path])
634
634
 
@@ -738,9 +738,9 @@ def override_skypilot_config(
738
738
  'Failed to override the SkyPilot config on API '
739
739
  'server with your local SkyPilot config:\n'
740
740
  '=== SkyPilot config on API server ===\n'
741
- f'{common_utils.dump_yaml_str(dict(original_config))}\n'
741
+ f'{yaml_utils.dump_yaml_str(dict(original_config))}\n'
742
742
  '=== Your local SkyPilot config ===\n'
743
- f'{common_utils.dump_yaml_str(dict(override_configs))}\n'
743
+ f'{yaml_utils.dump_yaml_str(dict(override_configs))}\n'
744
744
  f'Details: {e}') from e
745
745
  finally:
746
746
  _set_loaded_config(original_config)
@@ -767,7 +767,7 @@ def replace_skypilot_config(new_configs: config_utils.Config) -> Iterator[None]:
767
767
  mode='w',
768
768
  prefix='mutated-skypilot-config-',
769
769
  suffix='.yaml') as temp_file:
770
- common_utils.dump_yaml(temp_file.name, dict(**new_configs))
770
+ yaml_utils.dump_yaml(temp_file.name, dict(**new_configs))
771
771
  # Modify the env var of current process or context so that the
772
772
  # new config will be used by spawned sub-processes.
773
773
  # Note that this code modifies os.environ directly because it
@@ -831,7 +831,7 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
831
831
  parsed_config = _compose_cli_config(cli_config)
832
832
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
833
833
  logger.debug(f'applying following CLI overrides: \n'
834
- f'{common_utils.dump_yaml_str(dict(parsed_config))}')
834
+ f'{yaml_utils.dump_yaml_str(dict(parsed_config))}')
835
835
  _set_loaded_config(
836
836
  overlay_skypilot_config(original_config=_get_loaded_config(),
837
837
  override_configs=parsed_config))
@@ -875,7 +875,7 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
875
875
  def _set_config_yaml_to_db(key: str,
876
876
  config: config_utils.Config):
877
877
  assert sqlalchemy_engine is not None
878
- config_str = common_utils.dump_yaml_str(dict(config))
878
+ config_str = yaml_utils.dump_yaml_str(dict(config))
879
879
  with orm.Session(sqlalchemy_engine) as session:
880
880
  if (sqlalchemy_engine.dialect.name ==
881
881
  db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -901,7 +901,7 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
901
901
 
902
902
  if not db_updated:
903
903
  # save to the local file (PVC in Kubernetes, local file otherwise)
904
- common_utils.dump_yaml(global_config_path, dict(config))
904
+ yaml_utils.dump_yaml(global_config_path, dict(config))
905
905
 
906
906
  if config_map_utils.is_running_in_kubernetes():
907
907
  # In Kubernetes, sync the PVC config to ConfigMap for user
sky/task.py CHANGED
@@ -564,7 +564,7 @@ class Task:
564
564
  secrets_overrides: Optional[List[Tuple[str, str]]] = None,
565
565
  ) -> 'Task':
566
566
  user_specified_yaml = config.pop('_user_specified_yaml',
567
- common_utils.dump_yaml_str(config))
567
+ yaml_utils.dump_yaml_str(config))
568
568
  # More robust handling for 'envs': explicitly convert keys and values to
569
569
  # str, since users may pass '123' as keys/values which will get parsed
570
570
  # as int causing validate_schema() to fail.
sky/templates/nebius-ray.yml.j2 CHANGED
@@ -56,15 +56,11 @@ available_node_types:
56
56
  filesystem_mount_path: {{ fs.filesystem_mount_path }}
57
57
  {%- endfor %}
58
58
  UserData: |
59
- runcmd:
60
- - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
61
- - systemctl restart sshd
62
-
63
59
  {# Two available OS images:
64
- 1. ubuntu22.04-driverless - requires Docker installation
65
- 2. ubuntu22.04-cuda12 - comes with Docker pre-installed
66
- To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
67
- {%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
60
+ 1. ubuntu24.04-driverless - requires Docker installation
61
+ 2. ubuntu24.04-cuda12 - comes with Docker pre-installed
62
+ To optimize deployment speed, Docker is only installed when using ubuntu24.04-driverless #}
63
+ {%- if docker_image is not none and image_id.endswith('-driverless') %}
68
64
  apt:
69
65
  sources:
70
66
  docker.list: