skypilot-nightly 1.0.0.dev20250826__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (83)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +4 -10
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +41 -56
  6. sky/backends/cloud_vm_ray_backend.py +13 -24
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/client/cli/command.py +43 -10
  9. sky/client/common.py +41 -14
  10. sky/client/sdk.py +24 -9
  11. sky/client/sdk_async.py +6 -2
  12. sky/clouds/aws.py +1 -1
  13. sky/clouds/cloud.py +15 -0
  14. sky/clouds/kubernetes.py +27 -0
  15. sky/clouds/ssh.py +2 -3
  16. sky/core.py +1 -4
  17. sky/dashboard/out/404.html +1 -1
  18. sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  28. sky/dashboard/out/jobs.html +1 -1
  29. sky/dashboard/out/users.html +1 -1
  30. sky/dashboard/out/volumes.html +1 -1
  31. sky/dashboard/out/workspace/new.html +1 -1
  32. sky/dashboard/out/workspaces/[name].html +1 -1
  33. sky/dashboard/out/workspaces.html +1 -1
  34. sky/global_user_state.py +127 -23
  35. sky/jobs/client/sdk.py +5 -2
  36. sky/jobs/recovery_strategy.py +9 -4
  37. sky/logs/agent.py +2 -2
  38. sky/logs/aws.py +6 -3
  39. sky/provision/do/utils.py +2 -1
  40. sky/provision/kubernetes/config.py +2 -8
  41. sky/provision/kubernetes/instance.py +58 -8
  42. sky/provision/kubernetes/network_utils.py +3 -4
  43. sky/provision/kubernetes/utils.py +8 -7
  44. sky/provision/nebius/utils.py +51 -9
  45. sky/provision/vsphere/vsphere_utils.py +2 -8
  46. sky/schemas/api/responses.py +7 -0
  47. sky/serve/client/impl.py +5 -4
  48. sky/serve/replica_managers.py +4 -3
  49. sky/serve/serve_utils.py +4 -4
  50. sky/serve/server/impl.py +3 -2
  51. sky/serve/service_spec.py +2 -8
  52. sky/server/auth/authn.py +4 -0
  53. sky/server/auth/oauth2_proxy.py +10 -4
  54. sky/server/common.py +10 -3
  55. sky/server/daemons.py +10 -5
  56. sky/server/requests/executor.py +6 -1
  57. sky/server/requests/requests.py +21 -0
  58. sky/server/server.py +34 -33
  59. sky/server/uvicorn.py +33 -0
  60. sky/setup_files/dependencies.py +1 -0
  61. sky/sky_logging.py +4 -1
  62. sky/skylet/events.py +4 -5
  63. sky/skypilot_config.py +14 -12
  64. sky/ssh_node_pools/core.py +3 -1
  65. sky/task.py +4 -10
  66. sky/templates/nebius-ray.yml.j2 +4 -8
  67. sky/usage/usage_lib.py +3 -2
  68. sky/users/server.py +6 -6
  69. sky/utils/common_utils.py +0 -71
  70. sky/utils/controller_utils.py +4 -3
  71. sky/utils/dag_utils.py +4 -4
  72. sky/utils/kubernetes/config_map_utils.py +3 -3
  73. sky/utils/schemas.py +3 -0
  74. sky/utils/yaml_utils.py +102 -0
  75. sky/volumes/volume.py +8 -3
  76. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +2 -1
  77. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +83 -82
  78. /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
  79. /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
  80. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
  81. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
  82. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
  83. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
@@ -38,6 +38,7 @@ from sky.utils import schemas
38
38
  from sky.utils import status_lib
39
39
  from sky.utils import timeline
40
40
  from sky.utils import ux_utils
41
+ from sky.utils import yaml_utils
41
42
 
42
43
  if typing.TYPE_CHECKING:
43
44
  import jinja2
@@ -1898,7 +1899,7 @@ def is_kubeconfig_exec_auth(
1898
1899
 
1899
1900
  # Load the kubeconfig for the context
1900
1901
  kubeconfig_text = _get_kubeconfig_text_for_context(context)
1901
- kubeconfig = yaml.safe_load(kubeconfig_text)
1902
+ kubeconfig = yaml_utils.safe_load(kubeconfig_text)
1902
1903
 
1903
1904
  # Get the user details
1904
1905
  user_details = kubeconfig['users']
@@ -2601,7 +2602,7 @@ def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
2601
2602
  image=ssh_jump_image,
2602
2603
  secret=ssh_key_secret,
2603
2604
  service_type=service_type)
2604
- content = yaml.safe_load(cont)
2605
+ content = yaml_utils.safe_load(cont)
2605
2606
  return content
2606
2607
 
2607
2608
 
@@ -2750,7 +2751,7 @@ def combine_pod_config_fields(
2750
2751
  """
2751
2752
  with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2752
2753
  yaml_content = f.read()
2753
- yaml_obj = yaml.safe_load(yaml_content)
2754
+ yaml_obj = yaml_utils.safe_load(yaml_content)
2754
2755
  # We don't use override_configs in `get_effective_region_config`, as merging
2755
2756
  # the pod config requires special handling.
2756
2757
  if isinstance(cloud, clouds.SSH):
@@ -2781,7 +2782,7 @@ def combine_pod_config_fields(
2781
2782
  kubernetes_config)
2782
2783
 
2783
2784
  # Write the updated YAML back to the file
2784
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2785
+ yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2785
2786
 
2786
2787
 
2787
2788
  def combine_metadata_fields(cluster_yaml_path: str,
@@ -2795,7 +2796,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
2795
2796
 
2796
2797
  with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2797
2798
  yaml_content = f.read()
2798
- yaml_obj = yaml.safe_load(yaml_content)
2799
+ yaml_obj = yaml_utils.safe_load(yaml_content)
2799
2800
 
2800
2801
  # Get custom_metadata from global config
2801
2802
  custom_metadata = skypilot_config.get_effective_region_config(
@@ -2833,7 +2834,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
2833
2834
  config_utils.merge_k8s_configs(destination, custom_metadata)
2834
2835
 
2835
2836
  # Write the updated YAML back to the file
2836
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2837
+ yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2837
2838
 
2838
2839
 
2839
2840
  def merge_custom_metadata(
@@ -3689,7 +3690,7 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
3689
3690
  """
3690
3691
  # TODO(kyuds): GC cache files
3691
3692
  with open(kubeconfig_path, 'r', encoding='utf-8') as file:
3692
- config = yaml.safe_load(file)
3693
+ config = yaml_utils.safe_load(file)
3693
3694
  normalized = yaml.dump(config, sort_keys=True)
3694
3695
  hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
3695
3696
  path = os.path.expanduser(
@@ -14,6 +14,8 @@ logger = sky_logging.init_logger(__name__)
14
14
 
15
15
  POLL_INTERVAL = 5
16
16
 
17
+ _MAX_OPERATIONS_TO_FETCH = 1000
18
+
17
19
 
18
20
  def retry(func):
19
21
  """Decorator to retry a function."""
@@ -100,15 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
100
102
  def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
101
103
  """Lists instances associated with API key."""
102
104
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
103
- result = nebius.sync_call(
104
- service.list(
105
- nebius.compute().ListInstancesRequest(parent_id=project_id),
106
- timeout=nebius.READ_TIMEOUT))
107
-
108
- instances = result
105
+ page_token = ''
106
+ instances = []
107
+ while True:
108
+ result = nebius.sync_call(
109
+ service.list(nebius.compute().ListInstancesRequest(
110
+ parent_id=project_id,
111
+ page_size=100,
112
+ page_token=page_token,
113
+ ),
114
+ timeout=nebius.READ_TIMEOUT))
115
+ instances.extend(result.items)
116
+ if not result.next_page_token: # "" means no more pages
117
+ break
118
+ page_token = result.next_page_token
109
119
 
110
120
  instance_dict: Dict[str, Dict[str, Any]] = {}
111
- for instance in instances.items:
121
+ for instance in instances:
112
122
  info = {}
113
123
  info['status'] = instance.status.state.name
114
124
  info['name'] = instance.metadata.name
@@ -313,11 +323,43 @@ def launch(cluster_name_on_cloud: str,
313
323
  parent_id=project_id,
314
324
  name=instance_name,
315
325
  )))
326
+ instance_id = instance.metadata.id
316
327
  if instance.status.state.name == 'STARTING':
317
- instance_id = instance.metadata.id
318
328
  break
329
+
330
+ # All Instances initially have state=STOPPED and reconciling=True,
331
+ # so we need to wait until reconciling is False.
332
+ if instance.status.state.name == 'STOPPED' and \
333
+ not instance.status.reconciling:
334
+ next_token = ''
335
+ total_operations = 0
336
+ while True:
337
+ operations_response = nebius.sync_call(
338
+ service.list_operations_by_parent(
339
+ nebius.compute().ListOperationsByParentRequest(
340
+ parent_id=project_id,
341
+ page_size=100,
342
+ page_token=next_token,
343
+ )))
344
+ total_operations += len(operations_response.operations)
345
+ for operation in operations_response.operations:
346
+ # Find the most recent operation for the instance.
347
+ if operation.resource_id == instance_id:
348
+ error_msg = operation.description
349
+ if operation.status:
350
+ error_msg += f' {operation.status.message}'
351
+ raise RuntimeError(error_msg)
352
+ # If we've fetched too many operations, or there are no more
353
+ # operations to fetch, just raise a generic error.
354
+ if total_operations > _MAX_OPERATIONS_TO_FETCH or \
355
+ not operations_response.next_page_token:
356
+ raise RuntimeError(
357
+ f'Instance {instance_name} failed to start.')
358
+ next_token = operations_response.next_page_token
319
359
  time.sleep(POLL_INTERVAL)
320
- logger.debug(f'Waiting for instance {instance_name} start running.')
360
+ logger.debug(f'Waiting for instance {instance_name} to start running. '
361
+ f'State: {instance.status.state.name}, '
362
+ f'Reconciling: {instance.status.reconciling}')
321
363
  retry_count += 1
322
364
 
323
365
  if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
@@ -3,12 +3,10 @@
3
3
  import http.cookies as http_cookies
4
4
  import os
5
5
  import ssl
6
- import typing
7
6
  from typing import Any, Dict, List, Optional
8
7
 
9
8
  from sky import exceptions
10
9
  from sky import sky_logging
11
- from sky.adaptors import common as adaptors_common
12
10
  from sky.adaptors import vsphere as vsphere_adaptor
13
11
  from sky.catalog import vsphere_catalog
14
12
  from sky.catalog.common import get_catalog_path
@@ -28,11 +26,7 @@ from sky.provision.vsphere.common.vim_utils import create_spec_with_script
28
26
  from sky.provision.vsphere.common.vim_utils import poweron_vm
29
27
  from sky.provision.vsphere.common.vim_utils import wait_for_tasks
30
28
  from sky.provision.vsphere.common.vim_utils import wait_internal_ip_ready
31
-
32
- if typing.TYPE_CHECKING:
33
- import yaml
34
- else:
35
- yaml = adaptors_common.LazyImport('yaml')
29
+ from sky.utils import yaml_utils
36
30
 
37
31
  logger = sky_logging.init_logger(__name__)
38
32
 
@@ -323,7 +317,7 @@ def get_vsphere_credentials(name=None):
323
317
  assert os.path.exists(
324
318
  credential_path), f'Missing credential file at {credential_path}.'
325
319
  with open(credential_path, 'r', encoding='utf-8') as file:
326
- credential = yaml.safe_load(file)
320
+ credential = yaml_utils.safe_load(file)
327
321
  vcenters = credential['vcenters']
328
322
  if name is None:
329
323
  return vcenters
@@ -1,5 +1,6 @@
1
1
  """Responses for the API server."""
2
2
 
3
+ import enum
3
4
  from typing import Any, Dict, List, Optional
4
5
 
5
6
  import pydantic
@@ -117,3 +118,9 @@ class StatusResponse(ResponseBaseModel):
117
118
  cpus: Optional[str] = None
118
119
  memory: Optional[str] = None
119
120
  accelerators: Optional[str] = None
121
+
122
+
123
+ class UploadStatus(enum.Enum):
124
+ """Status of the upload."""
125
+ UPLOADING = 'uploading'
126
+ COMPLETED = 'completed'
sky/serve/client/impl.py CHANGED
@@ -224,10 +224,11 @@ def tail_logs(service_name: str,
224
224
  stream=True)
225
225
  request_id: server_common.RequestId[None] = server_common.get_request_id(
226
226
  response)
227
- return sdk.stream_response(request_id=request_id,
228
- response=response,
229
- output_stream=output_stream,
230
- resumable=True)
227
+ sdk.stream_response(request_id=request_id,
228
+ response=response,
229
+ output_stream=output_stream,
230
+ resumable=True,
231
+ get_result=follow)
231
232
 
232
233
 
233
234
  def sync_down_logs(service_name: str,
@@ -37,6 +37,7 @@ from sky.utils import env_options
37
37
  from sky.utils import resources_utils
38
38
  from sky.utils import status_lib
39
39
  from sky.utils import ux_utils
40
+ from sky.utils import yaml_utils
40
41
 
41
42
  if typing.TYPE_CHECKING:
42
43
  from sky.serve import service_spec
@@ -79,7 +80,7 @@ def launch_cluster(replica_id: int,
79
80
  f'{cluster_name} with resources override: '
80
81
  f'{resources_override}')
81
82
  try:
82
- config = common_utils.read_yaml(
83
+ config = yaml_utils.read_yaml(
83
84
  os.path.expanduser(service_task_yaml_path))
84
85
  task = task_lib.Task.from_yaml_config(config)
85
86
  if resources_override is not None:
@@ -1397,7 +1398,7 @@ class SkyPilotReplicaManager(ReplicaManager):
1397
1398
  # the latest version. This can significantly improve the speed
1398
1399
  # for updating an existing service with only config changes to the
1399
1400
  # service specs, e.g. scale down the service.
1400
- new_config = common_utils.read_yaml(
1401
+ new_config = yaml_utils.read_yaml(
1401
1402
  os.path.expanduser(service_task_yaml_path))
1402
1403
  # Always create new replicas and scale down old ones when file_mounts
1403
1404
  # are not empty.
@@ -1414,7 +1415,7 @@ class SkyPilotReplicaManager(ReplicaManager):
1414
1415
  old_service_task_yaml_path = (
1415
1416
  serve_utils.generate_task_yaml_file_name(
1416
1417
  self._service_name, info.version))
1417
- old_config = common_utils.read_yaml(
1418
+ old_config = yaml_utils.read_yaml(
1418
1419
  os.path.expanduser(old_service_task_yaml_path))
1419
1420
  for key in ['service', 'pool', '_user_specified_yaml']:
1420
1421
  old_config.pop(key, None)
sky/serve/serve_utils.py CHANGED
@@ -20,7 +20,6 @@ import uuid
20
20
 
21
21
  import colorama
22
22
  import filelock
23
- import yaml
24
23
 
25
24
  from sky import backends
26
25
  from sky import exceptions
@@ -43,6 +42,7 @@ from sky.utils import message_utils
43
42
  from sky.utils import resources_utils
44
43
  from sky.utils import status_lib
45
44
  from sky.utils import ux_utils
45
+ from sky.utils import yaml_utils
46
46
 
47
47
  if typing.TYPE_CHECKING:
48
48
  import fastapi
@@ -699,7 +699,7 @@ def _get_service_status(
699
699
  if record['pool']:
700
700
  latest_yaml_path = generate_task_yaml_file_name(service_name,
701
701
  record['version'])
702
- raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
702
+ raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
703
703
  original_config = raw_yaml_config.get('_user_specified_yaml')
704
704
  if original_config is None:
705
705
  # Fall back to old display format.
@@ -710,8 +710,8 @@ def _get_service_status(
710
710
  svc.pop('pool', None) # Remove pool from service config
711
711
  original_config['pool'] = svc # Add pool to root config
712
712
  else:
713
- original_config = yaml.safe_load(original_config)
714
- record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
713
+ original_config = yaml_utils.safe_load(original_config)
714
+ record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
715
715
 
716
716
  record['target_num_replicas'] = 0
717
717
  try:
sky/serve/server/impl.py CHANGED
@@ -34,6 +34,7 @@ from sky.utils import dag_utils
34
34
  from sky.utils import rich_utils
35
35
  from sky.utils import subprocess_utils
36
36
  from sky.utils import ux_utils
37
+ from sky.utils import yaml_utils
37
38
 
38
39
  logger = sky_logging.init_logger(__name__)
39
40
 
@@ -179,7 +180,7 @@ def up(
179
180
  controller = controller_utils.get_controller_for_pool(pool)
180
181
  controller_name = controller.value.cluster_name
181
182
  task_config = task.to_yaml_config()
182
- common_utils.dump_yaml(service_file.name, task_config)
183
+ yaml_utils.dump_yaml(service_file.name, task_config)
183
184
  remote_tmp_task_yaml_path = (
184
185
  serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
185
186
  remote_config_yaml_path = (
@@ -531,7 +532,7 @@ def update(
531
532
  prefix=f'{service_name}-v{current_version}',
532
533
  mode='w') as service_file:
533
534
  task_config = task.to_yaml_config()
534
- common_utils.dump_yaml(service_file.name, task_config)
535
+ yaml_utils.dump_yaml(service_file.name, task_config)
535
536
  remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
536
537
  service_name, current_version, expand_user=False)
537
538
 
sky/serve/service_spec.py CHANGED
@@ -2,11 +2,9 @@
2
2
  import json
3
3
  import os
4
4
  import textwrap
5
- import typing
6
5
  from typing import Any, Dict, List, Optional, Union
7
6
 
8
7
  from sky import serve
9
- from sky.adaptors import common as adaptors_common
10
8
  from sky.serve import constants
11
9
  from sky.serve import load_balancing_policies as lb_policies
12
10
  from sky.serve import serve_utils
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
14
12
  from sky.utils import common_utils
15
13
  from sky.utils import schemas
16
14
  from sky.utils import ux_utils
17
-
18
- if typing.TYPE_CHECKING:
19
- import yaml
20
- else:
21
- yaml = adaptors_common.LazyImport('yaml')
15
+ from sky.utils import yaml_utils
22
16
 
23
17
 
24
18
  class SkyServiceSpec:
@@ -274,7 +268,7 @@ class SkyServiceSpec:
274
268
  @staticmethod
275
269
  def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
276
270
  with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
277
- config = yaml.safe_load(f)
271
+ config = yaml_utils.safe_load(f)
278
272
 
279
273
  if isinstance(config, str):
280
274
  with ux_utils.print_exception_no_traceback():
sky/server/auth/authn.py CHANGED
@@ -14,6 +14,10 @@ logger = sky_logging.init_logger(__name__)
14
14
  # TODO(hailong): Remove this function and use request.state.auth_user instead.
15
15
  async def override_user_info_in_request_body(request: fastapi.Request,
16
16
  auth_user: Optional[models.User]):
17
+ # Skip for upload requests to avoid consuming the body prematurely, which
18
+ # will break the streaming upload.
19
+ if request.url.path.startswith('/upload'):
20
+ return
17
21
  if auth_user is None:
18
22
  return
19
23
 
@@ -4,6 +4,7 @@ import asyncio
4
4
  import hashlib
5
5
  import http
6
6
  import os
7
+ import traceback
7
8
  from typing import Optional
8
9
  import urllib
9
10
 
@@ -109,8 +110,8 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
109
110
  try:
110
111
  return await self._authenticate(request, call_next, session)
111
112
  except (aiohttp.ClientError, asyncio.TimeoutError) as e:
112
- logger.error(f'Error communicating with OAuth2 proxy: {e}')
113
- # Fail open or closed based on your security requirements
113
+ logger.error(f'Error communicating with OAuth2 proxy: {e}'
114
+ f'{traceback.format_exc()}')
114
115
  return fastapi.responses.JSONResponse(
115
116
  status_code=http.HTTPStatus.BAD_GATEWAY,
116
117
  content={'detail': 'oauth2-proxy service unavailable'})
@@ -120,10 +121,15 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
120
121
  forwarded_headers = dict(request.headers)
121
122
  auth_url = f'{self.proxy_base}/oauth2/auth'
122
123
  forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
123
- logger.debug(f'authenticate request: {request.url.path}')
124
+ # Remove content-length and content-type headers and drop request body
125
+ # to reduce the auth overhead.
126
+ forwarded_headers.pop('content-length', None)
127
+ forwarded_headers.pop('content-type', None)
128
+ logger.debug(f'authenticate request: {auth_url}, '
129
+ f'headers: {forwarded_headers}')
124
130
 
125
131
  async with session.request(
126
- method=request.method,
132
+ method='GET',
127
133
  url=auth_url,
128
134
  headers=forwarded_headers,
129
135
  cookies=request.cookies,
sky/server/common.py CHANGED
@@ -23,6 +23,7 @@ import uuid
23
23
  import cachetools
24
24
  import colorama
25
25
  import filelock
26
+ from passlib import context as passlib_context
26
27
  from typing_extensions import ParamSpec
27
28
 
28
29
  from sky import exceptions
@@ -40,6 +41,7 @@ from sky.utils import annotations
40
41
  from sky.utils import common_utils
41
42
  from sky.utils import rich_utils
42
43
  from sky.utils import ux_utils
44
+ from sky.utils import yaml_utils
43
45
 
44
46
  if typing.TYPE_CHECKING:
45
47
  import aiohttp
@@ -61,7 +63,7 @@ AVAILABLE_LOCAL_API_SERVER_URLS = [
61
63
 
62
64
  API_SERVER_CMD = '-m sky.server.server'
63
65
  # The client dir on the API server for storing user-specific data, such as file
64
- # mounts, logs, etc. This dir is empheral and will be cleaned up when the API
66
+ # mounts, logs, etc. This dir is ephemeral and will be cleaned up when the API
65
67
  # server is restarted.
66
68
  API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
67
69
  RETRY_COUNT_ON_TIMEOUT = 3
@@ -102,6 +104,11 @@ logger = sky_logging.init_logger(__name__)
102
104
 
103
105
  hinted_for_server_install_version_mismatch = False
104
106
 
107
+ crypt_ctx = passlib_context.CryptContext([
108
+ 'bcrypt', 'sha256_crypt', 'sha512_crypt', 'des_crypt', 'apr_md5_crypt',
109
+ 'ldap_sha1'
110
+ ])
111
+
105
112
 
106
113
  class ApiServerStatus(enum.Enum):
107
114
  HEALTHY = 'healthy'
@@ -810,7 +817,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
810
817
  return str(client_file_mounts_dir /
811
818
  file_mounts_mapping[original_path].lstrip('/'))
812
819
 
813
- task_configs = common_utils.read_yaml_all(str(client_task_path))
820
+ task_configs = yaml_utils.read_yaml_all(str(client_task_path))
814
821
  for task_config in task_configs:
815
822
  if task_config is None:
816
823
  continue
@@ -863,7 +870,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
863
870
  # We can switch to using string, but this is to make it easier to debug, by
864
871
  # persisting the translated task yaml file.
865
872
  translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
866
- common_utils.dump_yaml(str(translated_client_task_path), task_configs)
873
+ yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
867
874
 
868
875
  dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
869
876
  return dag
sky/server/daemons.py CHANGED
@@ -191,23 +191,28 @@ INTERNAL_REQUEST_DAEMONS = [
191
191
  # set to updated status automatically, without showing users the hint of
192
192
  # cluster being stopped or down when `sky status -r` is called.
193
193
  InternalRequestDaemon(id='skypilot-status-refresh-daemon',
194
- name='status',
194
+ name='status-refresh',
195
195
  event_fn=refresh_cluster_status_event,
196
196
  default_log_level='DEBUG'),
197
197
  # Volume status refresh daemon to update the volume status periodically.
198
198
  InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
199
- name='volume',
199
+ name='volume-refresh',
200
200
  event_fn=refresh_volume_status_event),
201
201
  InternalRequestDaemon(id='managed-job-status-refresh-daemon',
202
- name='managed-job-status',
202
+ name='managed-job-status-refresh',
203
203
  event_fn=managed_job_status_refresh_event,
204
204
  should_skip=should_skip_managed_job_status_refresh),
205
205
  InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
206
- name='sky-serve-status',
206
+ name='sky-serve-status-refresh',
207
207
  event_fn=sky_serve_status_refresh_event,
208
208
  should_skip=should_skip_sky_serve_status_refresh),
209
209
  InternalRequestDaemon(id='pool-status-refresh-daemon',
210
- name='pool-status',
210
+ name='pool-status-refresh',
211
211
  event_fn=pool_status_refresh_event,
212
212
  should_skip=should_skip_pool_status_refresh),
213
213
  ]
214
+
215
+
216
+ def is_daemon_request_id(request_id: str) -> bool:
217
+ """Returns whether a specific request_id is an internal daemon."""
218
+ return any([d.id == request_id for d in INTERNAL_REQUEST_DAEMONS])
@@ -55,6 +55,7 @@ from sky.utils import context_utils
55
55
  from sky.utils import subprocess_utils
56
56
  from sky.utils import tempstore
57
57
  from sky.utils import timeline
58
+ from sky.utils import yaml_utils
58
59
  from sky.workspaces import core as workspaces_core
59
60
 
60
61
  if typing.TYPE_CHECKING:
@@ -275,6 +276,10 @@ def override_request_env_and_config(
275
276
  request_id: str) -> Generator[None, None, None]:
276
277
  """Override the environment and SkyPilot config for a request."""
277
278
  original_env = os.environ.copy()
279
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
280
+ # affecting client requests. If set on the client side, it will be
281
+ # overridden by the request body.
282
+ os.environ.pop('SKYPILOT_DEBUG', None)
278
283
  os.environ.update(request_body.env_vars)
279
284
  # Note: may be overridden by AuthProxyMiddleware.
280
285
  # TODO(zhwu): we need to make the entire request a context available to the
@@ -383,7 +388,7 @@ def _request_execution_wrapper(request_id: str,
383
388
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
384
389
  config = skypilot_config.to_dict()
385
390
  logger.debug(f'request config: \n'
386
- f'{common_utils.dump_yaml_str(dict(config))}')
391
+ f'{yaml_utils.dump_yaml_str(dict(config))}')
387
392
  return_value = func(**request_body.to_kwargs())
388
393
  f.flush()
389
394
  except KeyboardInterrupt:
@@ -565,6 +565,27 @@ def get_request_tasks(
565
565
  return requests
566
566
 
567
567
 
568
+ @init_db
569
+ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
570
+ """Get a list of API request ids for shell completion."""
571
+ assert _DB is not None
572
+ with _DB.conn:
573
+ cursor = _DB.conn.cursor()
574
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
575
+ # then order by creation time (newest first) within each category.
576
+ cursor.execute(
577
+ f"""SELECT request_id FROM {REQUEST_TABLE}
578
+ WHERE request_id LIKE ?
579
+ ORDER BY
580
+ CASE
581
+ WHEN status IN ('PENDING', 'RUNNING') THEN 0
582
+ ELSE 1
583
+ END,
584
+ created_at DESC
585
+ LIMIT 1000""", (f'{incomplete}%',))
586
+ return [row[0] for row in cursor.fetchall()]
587
+
588
+
568
589
  def _add_or_update_request_no_lock(request: Request):
569
590
  """Add or update a REST request into the database."""
570
591
  row = request.to_row()