skypilot-nightly 1.0.0.dev20250826__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +4 -10
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +41 -56
- sky/backends/cloud_vm_ray_backend.py +13 -24
- sky/backends/local_docker_backend.py +3 -8
- sky/client/cli/command.py +43 -10
- sky/client/common.py +41 -14
- sky/client/sdk.py +24 -9
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +1 -1
- sky/clouds/cloud.py +15 -0
- sky/clouds/kubernetes.py +27 -0
- sky/clouds/ssh.py +2 -3
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +127 -23
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/config.py +2 -8
- sky/provision/kubernetes/instance.py +58 -8
- sky/provision/kubernetes/network_utils.py +3 -4
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/utils.py +51 -9
- sky/provision/vsphere/vsphere_utils.py +2 -8
- sky/schemas/api/responses.py +7 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +4 -4
- sky/serve/server/impl.py +3 -2
- sky/serve/service_spec.py +2 -8
- sky/server/auth/authn.py +4 -0
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +10 -3
- sky/server/daemons.py +10 -5
- sky/server/requests/executor.py +6 -1
- sky/server/requests/requests.py +21 -0
- sky/server/server.py +34 -33
- sky/server/uvicorn.py +33 -0
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +4 -1
- sky/skylet/events.py +4 -5
- sky/skypilot_config.py +14 -12
- sky/ssh_node_pools/core.py +3 -1
- sky/task.py +4 -10
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/users/server.py +6 -6
- sky/utils/common_utils.py +0 -71
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/schemas.py +3 -0
- sky/utils/yaml_utils.py +102 -0
- sky/volumes/volume.py +8 -3
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +83 -82
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
|
@@ -38,6 +38,7 @@ from sky.utils import schemas
|
|
|
38
38
|
from sky.utils import status_lib
|
|
39
39
|
from sky.utils import timeline
|
|
40
40
|
from sky.utils import ux_utils
|
|
41
|
+
from sky.utils import yaml_utils
|
|
41
42
|
|
|
42
43
|
if typing.TYPE_CHECKING:
|
|
43
44
|
import jinja2
|
|
@@ -1898,7 +1899,7 @@ def is_kubeconfig_exec_auth(
|
|
|
1898
1899
|
|
|
1899
1900
|
# Load the kubeconfig for the context
|
|
1900
1901
|
kubeconfig_text = _get_kubeconfig_text_for_context(context)
|
|
1901
|
-
kubeconfig =
|
|
1902
|
+
kubeconfig = yaml_utils.safe_load(kubeconfig_text)
|
|
1902
1903
|
|
|
1903
1904
|
# Get the user details
|
|
1904
1905
|
user_details = kubeconfig['users']
|
|
@@ -2601,7 +2602,7 @@ def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
|
|
|
2601
2602
|
image=ssh_jump_image,
|
|
2602
2603
|
secret=ssh_key_secret,
|
|
2603
2604
|
service_type=service_type)
|
|
2604
|
-
content =
|
|
2605
|
+
content = yaml_utils.safe_load(cont)
|
|
2605
2606
|
return content
|
|
2606
2607
|
|
|
2607
2608
|
|
|
@@ -2750,7 +2751,7 @@ def combine_pod_config_fields(
|
|
|
2750
2751
|
"""
|
|
2751
2752
|
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2752
2753
|
yaml_content = f.read()
|
|
2753
|
-
yaml_obj =
|
|
2754
|
+
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2754
2755
|
# We don't use override_configs in `get_effective_region_config`, as merging
|
|
2755
2756
|
# the pod config requires special handling.
|
|
2756
2757
|
if isinstance(cloud, clouds.SSH):
|
|
@@ -2781,7 +2782,7 @@ def combine_pod_config_fields(
|
|
|
2781
2782
|
kubernetes_config)
|
|
2782
2783
|
|
|
2783
2784
|
# Write the updated YAML back to the file
|
|
2784
|
-
|
|
2785
|
+
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2785
2786
|
|
|
2786
2787
|
|
|
2787
2788
|
def combine_metadata_fields(cluster_yaml_path: str,
|
|
@@ -2795,7 +2796,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2795
2796
|
|
|
2796
2797
|
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2797
2798
|
yaml_content = f.read()
|
|
2798
|
-
yaml_obj =
|
|
2799
|
+
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2799
2800
|
|
|
2800
2801
|
# Get custom_metadata from global config
|
|
2801
2802
|
custom_metadata = skypilot_config.get_effective_region_config(
|
|
@@ -2833,7 +2834,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2833
2834
|
config_utils.merge_k8s_configs(destination, custom_metadata)
|
|
2834
2835
|
|
|
2835
2836
|
# Write the updated YAML back to the file
|
|
2836
|
-
|
|
2837
|
+
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2837
2838
|
|
|
2838
2839
|
|
|
2839
2840
|
def merge_custom_metadata(
|
|
@@ -3689,7 +3690,7 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
|
|
|
3689
3690
|
"""
|
|
3690
3691
|
# TODO(kyuds): GC cache files
|
|
3691
3692
|
with open(kubeconfig_path, 'r', encoding='utf-8') as file:
|
|
3692
|
-
config =
|
|
3693
|
+
config = yaml_utils.safe_load(file)
|
|
3693
3694
|
normalized = yaml.dump(config, sort_keys=True)
|
|
3694
3695
|
hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
|
|
3695
3696
|
path = os.path.expanduser(
|
sky/provision/nebius/utils.py
CHANGED
|
@@ -14,6 +14,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
14
14
|
|
|
15
15
|
POLL_INTERVAL = 5
|
|
16
16
|
|
|
17
|
+
_MAX_OPERATIONS_TO_FETCH = 1000
|
|
18
|
+
|
|
17
19
|
|
|
18
20
|
def retry(func):
|
|
19
21
|
"""Decorator to retry a function."""
|
|
@@ -100,15 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
|
|
|
100
102
|
def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
|
|
101
103
|
"""Lists instances associated with API key."""
|
|
102
104
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
105
|
+
page_token = ''
|
|
106
|
+
instances = []
|
|
107
|
+
while True:
|
|
108
|
+
result = nebius.sync_call(
|
|
109
|
+
service.list(nebius.compute().ListInstancesRequest(
|
|
110
|
+
parent_id=project_id,
|
|
111
|
+
page_size=100,
|
|
112
|
+
page_token=page_token,
|
|
113
|
+
),
|
|
114
|
+
timeout=nebius.READ_TIMEOUT))
|
|
115
|
+
instances.extend(result.items)
|
|
116
|
+
if not result.next_page_token: # "" means no more pages
|
|
117
|
+
break
|
|
118
|
+
page_token = result.next_page_token
|
|
109
119
|
|
|
110
120
|
instance_dict: Dict[str, Dict[str, Any]] = {}
|
|
111
|
-
for instance in instances
|
|
121
|
+
for instance in instances:
|
|
112
122
|
info = {}
|
|
113
123
|
info['status'] = instance.status.state.name
|
|
114
124
|
info['name'] = instance.metadata.name
|
|
@@ -313,11 +323,43 @@ def launch(cluster_name_on_cloud: str,
|
|
|
313
323
|
parent_id=project_id,
|
|
314
324
|
name=instance_name,
|
|
315
325
|
)))
|
|
326
|
+
instance_id = instance.metadata.id
|
|
316
327
|
if instance.status.state.name == 'STARTING':
|
|
317
|
-
instance_id = instance.metadata.id
|
|
318
328
|
break
|
|
329
|
+
|
|
330
|
+
# All Instances initially have state=STOPPED and reconciling=True,
|
|
331
|
+
# so we need to wait until reconciling is False.
|
|
332
|
+
if instance.status.state.name == 'STOPPED' and \
|
|
333
|
+
not instance.status.reconciling:
|
|
334
|
+
next_token = ''
|
|
335
|
+
total_operations = 0
|
|
336
|
+
while True:
|
|
337
|
+
operations_response = nebius.sync_call(
|
|
338
|
+
service.list_operations_by_parent(
|
|
339
|
+
nebius.compute().ListOperationsByParentRequest(
|
|
340
|
+
parent_id=project_id,
|
|
341
|
+
page_size=100,
|
|
342
|
+
page_token=next_token,
|
|
343
|
+
)))
|
|
344
|
+
total_operations += len(operations_response.operations)
|
|
345
|
+
for operation in operations_response.operations:
|
|
346
|
+
# Find the most recent operation for the instance.
|
|
347
|
+
if operation.resource_id == instance_id:
|
|
348
|
+
error_msg = operation.description
|
|
349
|
+
if operation.status:
|
|
350
|
+
error_msg += f' {operation.status.message}'
|
|
351
|
+
raise RuntimeError(error_msg)
|
|
352
|
+
# If we've fetched too many operations, or there are no more
|
|
353
|
+
# operations to fetch, just raise a generic error.
|
|
354
|
+
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
|
|
355
|
+
not operations_response.next_page_token:
|
|
356
|
+
raise RuntimeError(
|
|
357
|
+
f'Instance {instance_name} failed to start.')
|
|
358
|
+
next_token = operations_response.next_page_token
|
|
319
359
|
time.sleep(POLL_INTERVAL)
|
|
320
|
-
logger.debug(f'Waiting for instance {instance_name} start running.'
|
|
360
|
+
logger.debug(f'Waiting for instance {instance_name} to start running. '
|
|
361
|
+
f'State: {instance.status.state.name}, '
|
|
362
|
+
f'Reconciling: {instance.status.reconciling}')
|
|
321
363
|
retry_count += 1
|
|
322
364
|
|
|
323
365
|
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
@@ -3,12 +3,10 @@
|
|
|
3
3
|
import http.cookies as http_cookies
|
|
4
4
|
import os
|
|
5
5
|
import ssl
|
|
6
|
-
import typing
|
|
7
6
|
from typing import Any, Dict, List, Optional
|
|
8
7
|
|
|
9
8
|
from sky import exceptions
|
|
10
9
|
from sky import sky_logging
|
|
11
|
-
from sky.adaptors import common as adaptors_common
|
|
12
10
|
from sky.adaptors import vsphere as vsphere_adaptor
|
|
13
11
|
from sky.catalog import vsphere_catalog
|
|
14
12
|
from sky.catalog.common import get_catalog_path
|
|
@@ -28,11 +26,7 @@ from sky.provision.vsphere.common.vim_utils import create_spec_with_script
|
|
|
28
26
|
from sky.provision.vsphere.common.vim_utils import poweron_vm
|
|
29
27
|
from sky.provision.vsphere.common.vim_utils import wait_for_tasks
|
|
30
28
|
from sky.provision.vsphere.common.vim_utils import wait_internal_ip_ready
|
|
31
|
-
|
|
32
|
-
if typing.TYPE_CHECKING:
|
|
33
|
-
import yaml
|
|
34
|
-
else:
|
|
35
|
-
yaml = adaptors_common.LazyImport('yaml')
|
|
29
|
+
from sky.utils import yaml_utils
|
|
36
30
|
|
|
37
31
|
logger = sky_logging.init_logger(__name__)
|
|
38
32
|
|
|
@@ -323,7 +317,7 @@ def get_vsphere_credentials(name=None):
|
|
|
323
317
|
assert os.path.exists(
|
|
324
318
|
credential_path), f'Missing credential file at {credential_path}.'
|
|
325
319
|
with open(credential_path, 'r', encoding='utf-8') as file:
|
|
326
|
-
credential =
|
|
320
|
+
credential = yaml_utils.safe_load(file)
|
|
327
321
|
vcenters = credential['vcenters']
|
|
328
322
|
if name is None:
|
|
329
323
|
return vcenters
|
sky/schemas/api/responses.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Responses for the API server."""
|
|
2
2
|
|
|
3
|
+
import enum
|
|
3
4
|
from typing import Any, Dict, List, Optional
|
|
4
5
|
|
|
5
6
|
import pydantic
|
|
@@ -117,3 +118,9 @@ class StatusResponse(ResponseBaseModel):
|
|
|
117
118
|
cpus: Optional[str] = None
|
|
118
119
|
memory: Optional[str] = None
|
|
119
120
|
accelerators: Optional[str] = None
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class UploadStatus(enum.Enum):
|
|
124
|
+
"""Status of the upload."""
|
|
125
|
+
UPLOADING = 'uploading'
|
|
126
|
+
COMPLETED = 'completed'
|
sky/serve/client/impl.py
CHANGED
|
@@ -224,10 +224,11 @@ def tail_logs(service_name: str,
|
|
|
224
224
|
stream=True)
|
|
225
225
|
request_id: server_common.RequestId[None] = server_common.get_request_id(
|
|
226
226
|
response)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
227
|
+
sdk.stream_response(request_id=request_id,
|
|
228
|
+
response=response,
|
|
229
|
+
output_stream=output_stream,
|
|
230
|
+
resumable=True,
|
|
231
|
+
get_result=follow)
|
|
231
232
|
|
|
232
233
|
|
|
233
234
|
def sync_down_logs(service_name: str,
|
sky/serve/replica_managers.py
CHANGED
|
@@ -37,6 +37,7 @@ from sky.utils import env_options
|
|
|
37
37
|
from sky.utils import resources_utils
|
|
38
38
|
from sky.utils import status_lib
|
|
39
39
|
from sky.utils import ux_utils
|
|
40
|
+
from sky.utils import yaml_utils
|
|
40
41
|
|
|
41
42
|
if typing.TYPE_CHECKING:
|
|
42
43
|
from sky.serve import service_spec
|
|
@@ -79,7 +80,7 @@ def launch_cluster(replica_id: int,
|
|
|
79
80
|
f'{cluster_name} with resources override: '
|
|
80
81
|
f'{resources_override}')
|
|
81
82
|
try:
|
|
82
|
-
config =
|
|
83
|
+
config = yaml_utils.read_yaml(
|
|
83
84
|
os.path.expanduser(service_task_yaml_path))
|
|
84
85
|
task = task_lib.Task.from_yaml_config(config)
|
|
85
86
|
if resources_override is not None:
|
|
@@ -1397,7 +1398,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1397
1398
|
# the latest version. This can significantly improve the speed
|
|
1398
1399
|
# for updating an existing service with only config changes to the
|
|
1399
1400
|
# service specs, e.g. scale down the service.
|
|
1400
|
-
new_config =
|
|
1401
|
+
new_config = yaml_utils.read_yaml(
|
|
1401
1402
|
os.path.expanduser(service_task_yaml_path))
|
|
1402
1403
|
# Always create new replicas and scale down old ones when file_mounts
|
|
1403
1404
|
# are not empty.
|
|
@@ -1414,7 +1415,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1414
1415
|
old_service_task_yaml_path = (
|
|
1415
1416
|
serve_utils.generate_task_yaml_file_name(
|
|
1416
1417
|
self._service_name, info.version))
|
|
1417
|
-
old_config =
|
|
1418
|
+
old_config = yaml_utils.read_yaml(
|
|
1418
1419
|
os.path.expanduser(old_service_task_yaml_path))
|
|
1419
1420
|
for key in ['service', 'pool', '_user_specified_yaml']:
|
|
1420
1421
|
old_config.pop(key, None)
|
sky/serve/serve_utils.py
CHANGED
|
@@ -20,7 +20,6 @@ import uuid
|
|
|
20
20
|
|
|
21
21
|
import colorama
|
|
22
22
|
import filelock
|
|
23
|
-
import yaml
|
|
24
23
|
|
|
25
24
|
from sky import backends
|
|
26
25
|
from sky import exceptions
|
|
@@ -43,6 +42,7 @@ from sky.utils import message_utils
|
|
|
43
42
|
from sky.utils import resources_utils
|
|
44
43
|
from sky.utils import status_lib
|
|
45
44
|
from sky.utils import ux_utils
|
|
45
|
+
from sky.utils import yaml_utils
|
|
46
46
|
|
|
47
47
|
if typing.TYPE_CHECKING:
|
|
48
48
|
import fastapi
|
|
@@ -699,7 +699,7 @@ def _get_service_status(
|
|
|
699
699
|
if record['pool']:
|
|
700
700
|
latest_yaml_path = generate_task_yaml_file_name(service_name,
|
|
701
701
|
record['version'])
|
|
702
|
-
raw_yaml_config =
|
|
702
|
+
raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
|
|
703
703
|
original_config = raw_yaml_config.get('_user_specified_yaml')
|
|
704
704
|
if original_config is None:
|
|
705
705
|
# Fall back to old display format.
|
|
@@ -710,8 +710,8 @@ def _get_service_status(
|
|
|
710
710
|
svc.pop('pool', None) # Remove pool from service config
|
|
711
711
|
original_config['pool'] = svc # Add pool to root config
|
|
712
712
|
else:
|
|
713
|
-
original_config =
|
|
714
|
-
record['pool_yaml'] =
|
|
713
|
+
original_config = yaml_utils.safe_load(original_config)
|
|
714
|
+
record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
|
|
715
715
|
|
|
716
716
|
record['target_num_replicas'] = 0
|
|
717
717
|
try:
|
sky/serve/server/impl.py
CHANGED
|
@@ -34,6 +34,7 @@ from sky.utils import dag_utils
|
|
|
34
34
|
from sky.utils import rich_utils
|
|
35
35
|
from sky.utils import subprocess_utils
|
|
36
36
|
from sky.utils import ux_utils
|
|
37
|
+
from sky.utils import yaml_utils
|
|
37
38
|
|
|
38
39
|
logger = sky_logging.init_logger(__name__)
|
|
39
40
|
|
|
@@ -179,7 +180,7 @@ def up(
|
|
|
179
180
|
controller = controller_utils.get_controller_for_pool(pool)
|
|
180
181
|
controller_name = controller.value.cluster_name
|
|
181
182
|
task_config = task.to_yaml_config()
|
|
182
|
-
|
|
183
|
+
yaml_utils.dump_yaml(service_file.name, task_config)
|
|
183
184
|
remote_tmp_task_yaml_path = (
|
|
184
185
|
serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
|
|
185
186
|
remote_config_yaml_path = (
|
|
@@ -531,7 +532,7 @@ def update(
|
|
|
531
532
|
prefix=f'{service_name}-v{current_version}',
|
|
532
533
|
mode='w') as service_file:
|
|
533
534
|
task_config = task.to_yaml_config()
|
|
534
|
-
|
|
535
|
+
yaml_utils.dump_yaml(service_file.name, task_config)
|
|
535
536
|
remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
|
|
536
537
|
service_name, current_version, expand_user=False)
|
|
537
538
|
|
sky/serve/service_spec.py
CHANGED
|
@@ -2,11 +2,9 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
4
|
import textwrap
|
|
5
|
-
import typing
|
|
6
5
|
from typing import Any, Dict, List, Optional, Union
|
|
7
6
|
|
|
8
7
|
from sky import serve
|
|
9
|
-
from sky.adaptors import common as adaptors_common
|
|
10
8
|
from sky.serve import constants
|
|
11
9
|
from sky.serve import load_balancing_policies as lb_policies
|
|
12
10
|
from sky.serve import serve_utils
|
|
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
|
|
|
14
12
|
from sky.utils import common_utils
|
|
15
13
|
from sky.utils import schemas
|
|
16
14
|
from sky.utils import ux_utils
|
|
17
|
-
|
|
18
|
-
if typing.TYPE_CHECKING:
|
|
19
|
-
import yaml
|
|
20
|
-
else:
|
|
21
|
-
yaml = adaptors_common.LazyImport('yaml')
|
|
15
|
+
from sky.utils import yaml_utils
|
|
22
16
|
|
|
23
17
|
|
|
24
18
|
class SkyServiceSpec:
|
|
@@ -274,7 +268,7 @@ class SkyServiceSpec:
|
|
|
274
268
|
@staticmethod
|
|
275
269
|
def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
|
|
276
270
|
with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
|
|
277
|
-
config =
|
|
271
|
+
config = yaml_utils.safe_load(f)
|
|
278
272
|
|
|
279
273
|
if isinstance(config, str):
|
|
280
274
|
with ux_utils.print_exception_no_traceback():
|
sky/server/auth/authn.py
CHANGED
|
@@ -14,6 +14,10 @@ logger = sky_logging.init_logger(__name__)
|
|
|
14
14
|
# TODO(hailong): Remove this function and use request.state.auth_user instead.
|
|
15
15
|
async def override_user_info_in_request_body(request: fastapi.Request,
|
|
16
16
|
auth_user: Optional[models.User]):
|
|
17
|
+
# Skip for upload requests to avoid consuming the body prematurely, which
|
|
18
|
+
# will break the streaming upload.
|
|
19
|
+
if request.url.path.startswith('/upload'):
|
|
20
|
+
return
|
|
17
21
|
if auth_user is None:
|
|
18
22
|
return
|
|
19
23
|
|
sky/server/auth/oauth2_proxy.py
CHANGED
|
@@ -4,6 +4,7 @@ import asyncio
|
|
|
4
4
|
import hashlib
|
|
5
5
|
import http
|
|
6
6
|
import os
|
|
7
|
+
import traceback
|
|
7
8
|
from typing import Optional
|
|
8
9
|
import urllib
|
|
9
10
|
|
|
@@ -109,8 +110,8 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
109
110
|
try:
|
|
110
111
|
return await self._authenticate(request, call_next, session)
|
|
111
112
|
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
112
|
-
logger.error(f'Error communicating with OAuth2 proxy: {e}'
|
|
113
|
-
|
|
113
|
+
logger.error(f'Error communicating with OAuth2 proxy: {e}'
|
|
114
|
+
f'{traceback.format_exc()}')
|
|
114
115
|
return fastapi.responses.JSONResponse(
|
|
115
116
|
status_code=http.HTTPStatus.BAD_GATEWAY,
|
|
116
117
|
content={'detail': 'oauth2-proxy service unavailable'})
|
|
@@ -120,10 +121,15 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
120
121
|
forwarded_headers = dict(request.headers)
|
|
121
122
|
auth_url = f'{self.proxy_base}/oauth2/auth'
|
|
122
123
|
forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
|
|
123
|
-
|
|
124
|
+
# Remove content-length and content-type headers and drop request body
|
|
125
|
+
# to reduce the auth overhead.
|
|
126
|
+
forwarded_headers.pop('content-length', None)
|
|
127
|
+
forwarded_headers.pop('content-type', None)
|
|
128
|
+
logger.debug(f'authenticate request: {auth_url}, '
|
|
129
|
+
f'headers: {forwarded_headers}')
|
|
124
130
|
|
|
125
131
|
async with session.request(
|
|
126
|
-
method=
|
|
132
|
+
method='GET',
|
|
127
133
|
url=auth_url,
|
|
128
134
|
headers=forwarded_headers,
|
|
129
135
|
cookies=request.cookies,
|
sky/server/common.py
CHANGED
|
@@ -23,6 +23,7 @@ import uuid
|
|
|
23
23
|
import cachetools
|
|
24
24
|
import colorama
|
|
25
25
|
import filelock
|
|
26
|
+
from passlib import context as passlib_context
|
|
26
27
|
from typing_extensions import ParamSpec
|
|
27
28
|
|
|
28
29
|
from sky import exceptions
|
|
@@ -40,6 +41,7 @@ from sky.utils import annotations
|
|
|
40
41
|
from sky.utils import common_utils
|
|
41
42
|
from sky.utils import rich_utils
|
|
42
43
|
from sky.utils import ux_utils
|
|
44
|
+
from sky.utils import yaml_utils
|
|
43
45
|
|
|
44
46
|
if typing.TYPE_CHECKING:
|
|
45
47
|
import aiohttp
|
|
@@ -61,7 +63,7 @@ AVAILABLE_LOCAL_API_SERVER_URLS = [
|
|
|
61
63
|
|
|
62
64
|
API_SERVER_CMD = '-m sky.server.server'
|
|
63
65
|
# The client dir on the API server for storing user-specific data, such as file
|
|
64
|
-
# mounts, logs, etc. This dir is
|
|
66
|
+
# mounts, logs, etc. This dir is ephemeral and will be cleaned up when the API
|
|
65
67
|
# server is restarted.
|
|
66
68
|
API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
|
|
67
69
|
RETRY_COUNT_ON_TIMEOUT = 3
|
|
@@ -102,6 +104,11 @@ logger = sky_logging.init_logger(__name__)
|
|
|
102
104
|
|
|
103
105
|
hinted_for_server_install_version_mismatch = False
|
|
104
106
|
|
|
107
|
+
crypt_ctx = passlib_context.CryptContext([
|
|
108
|
+
'bcrypt', 'sha256_crypt', 'sha512_crypt', 'des_crypt', 'apr_md5_crypt',
|
|
109
|
+
'ldap_sha1'
|
|
110
|
+
])
|
|
111
|
+
|
|
105
112
|
|
|
106
113
|
class ApiServerStatus(enum.Enum):
|
|
107
114
|
HEALTHY = 'healthy'
|
|
@@ -810,7 +817,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
810
817
|
return str(client_file_mounts_dir /
|
|
811
818
|
file_mounts_mapping[original_path].lstrip('/'))
|
|
812
819
|
|
|
813
|
-
task_configs =
|
|
820
|
+
task_configs = yaml_utils.read_yaml_all(str(client_task_path))
|
|
814
821
|
for task_config in task_configs:
|
|
815
822
|
if task_config is None:
|
|
816
823
|
continue
|
|
@@ -863,7 +870,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
863
870
|
# We can switch to using string, but this is to make it easier to debug, by
|
|
864
871
|
# persisting the translated task yaml file.
|
|
865
872
|
translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
|
|
866
|
-
|
|
873
|
+
yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
|
|
867
874
|
|
|
868
875
|
dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
|
|
869
876
|
return dag
|
sky/server/daemons.py
CHANGED
|
@@ -191,23 +191,28 @@ INTERNAL_REQUEST_DAEMONS = [
|
|
|
191
191
|
# set to updated status automatically, without showing users the hint of
|
|
192
192
|
# cluster being stopped or down when `sky status -r` is called.
|
|
193
193
|
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
|
194
|
-
name='status',
|
|
194
|
+
name='status-refresh',
|
|
195
195
|
event_fn=refresh_cluster_status_event,
|
|
196
196
|
default_log_level='DEBUG'),
|
|
197
197
|
# Volume status refresh daemon to update the volume status periodically.
|
|
198
198
|
InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
|
|
199
|
-
name='volume',
|
|
199
|
+
name='volume-refresh',
|
|
200
200
|
event_fn=refresh_volume_status_event),
|
|
201
201
|
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
|
202
|
-
name='managed-job-status',
|
|
202
|
+
name='managed-job-status-refresh',
|
|
203
203
|
event_fn=managed_job_status_refresh_event,
|
|
204
204
|
should_skip=should_skip_managed_job_status_refresh),
|
|
205
205
|
InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
|
|
206
|
-
name='sky-serve-status',
|
|
206
|
+
name='sky-serve-status-refresh',
|
|
207
207
|
event_fn=sky_serve_status_refresh_event,
|
|
208
208
|
should_skip=should_skip_sky_serve_status_refresh),
|
|
209
209
|
InternalRequestDaemon(id='pool-status-refresh-daemon',
|
|
210
|
-
name='pool-status',
|
|
210
|
+
name='pool-status-refresh',
|
|
211
211
|
event_fn=pool_status_refresh_event,
|
|
212
212
|
should_skip=should_skip_pool_status_refresh),
|
|
213
213
|
]
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def is_daemon_request_id(request_id: str) -> bool:
|
|
217
|
+
"""Returns whether a specific request_id is an internal daemon."""
|
|
218
|
+
return any([d.id == request_id for d in INTERNAL_REQUEST_DAEMONS])
|
sky/server/requests/executor.py
CHANGED
|
@@ -55,6 +55,7 @@ from sky.utils import context_utils
|
|
|
55
55
|
from sky.utils import subprocess_utils
|
|
56
56
|
from sky.utils import tempstore
|
|
57
57
|
from sky.utils import timeline
|
|
58
|
+
from sky.utils import yaml_utils
|
|
58
59
|
from sky.workspaces import core as workspaces_core
|
|
59
60
|
|
|
60
61
|
if typing.TYPE_CHECKING:
|
|
@@ -275,6 +276,10 @@ def override_request_env_and_config(
|
|
|
275
276
|
request_id: str) -> Generator[None, None, None]:
|
|
276
277
|
"""Override the environment and SkyPilot config for a request."""
|
|
277
278
|
original_env = os.environ.copy()
|
|
279
|
+
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
|
|
280
|
+
# affecting client requests. If set on the client side, it will be
|
|
281
|
+
# overridden by the request body.
|
|
282
|
+
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
278
283
|
os.environ.update(request_body.env_vars)
|
|
279
284
|
# Note: may be overridden by AuthProxyMiddleware.
|
|
280
285
|
# TODO(zhwu): we need to make the entire request a context available to the
|
|
@@ -383,7 +388,7 @@ def _request_execution_wrapper(request_id: str,
|
|
|
383
388
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
384
389
|
config = skypilot_config.to_dict()
|
|
385
390
|
logger.debug(f'request config: \n'
|
|
386
|
-
f'{
|
|
391
|
+
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
387
392
|
return_value = func(**request_body.to_kwargs())
|
|
388
393
|
f.flush()
|
|
389
394
|
except KeyboardInterrupt:
|
sky/server/requests/requests.py
CHANGED
|
@@ -565,6 +565,27 @@ def get_request_tasks(
|
|
|
565
565
|
return requests
|
|
566
566
|
|
|
567
567
|
|
|
568
|
+
@init_db
|
|
569
|
+
def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
570
|
+
"""Get a list of API request ids for shell completion."""
|
|
571
|
+
assert _DB is not None
|
|
572
|
+
with _DB.conn:
|
|
573
|
+
cursor = _DB.conn.cursor()
|
|
574
|
+
# Prioritize alive requests (PENDING, RUNNING) over finished ones,
|
|
575
|
+
# then order by creation time (newest first) within each category.
|
|
576
|
+
cursor.execute(
|
|
577
|
+
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
578
|
+
WHERE request_id LIKE ?
|
|
579
|
+
ORDER BY
|
|
580
|
+
CASE
|
|
581
|
+
WHEN status IN ('PENDING', 'RUNNING') THEN 0
|
|
582
|
+
ELSE 1
|
|
583
|
+
END,
|
|
584
|
+
created_at DESC
|
|
585
|
+
LIMIT 1000""", (f'{incomplete}%',))
|
|
586
|
+
return [row[0] for row in cursor.fetchall()]
|
|
587
|
+
|
|
588
|
+
|
|
568
589
|
def _add_or_update_request_no_lock(request: Request):
|
|
569
590
|
"""Add or update a REST request into the database."""
|
|
570
591
|
row = request.to_row()
|