skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +194 -69
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +217 -36
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +4 -1
- sky/setup_files/setup.py +44 -44
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +107 -107
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/provision/vsphere/instance.py
CHANGED

@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
 WORKER_NODE_VALUE = '0'


-def run_instances(region: str, cluster_name: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: run_instances().')

     resumed_instance_ids: List[str] = []
@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
     vc_object = _get_vc_object(region)
     vc_object.connect()

-    exist_instances = _get_filtered_instance(vc_object,
+    exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
                                              config.provider_config)
     head_instance_id = _get_head_instance_id(exist_instances)

@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
             config, region, vc_object)
     # TODO: update logic for multi-node creation
     for _ in range(to_start_num):
-        created_instance_uuid = _create_instances(
-            region, vc_object,
+        created_instance_uuid = _create_instances(cluster_name_on_cloud,
+                                                  config, region, vc_object,
                                                   vsphere_cluster_name)
         created_instance_ids.append(created_instance_uuid)
         if head_instance_id is None:
@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
         provider_name='vsphere',
         region=region,
         zone=vsphere_cluster_name,
-        cluster_name=
+        cluster_name=cluster_name_on_cloud,
         head_instance_id=head_instance_id,
         resumed_instance_ids=resumed_instance_ids,
         created_instance_ids=created_instance_ids,
sky/schemas/api/responses.py
CHANGED

@@ -86,7 +86,7 @@ class StatusResponse(ResponseBaseModel):
     # backends.ResourceHandle, so we use Any here.
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
-    handle: Any
+    handle: Optional[Any] = None
     last_use: str
     status: status_lib.ClusterStatus
     autostop: int
@@ -118,6 +118,7 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    cluster_name_on_cloud: Optional[str] = None


 class UploadStatus(enum.Enum):
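
Making handle optional with a None default is what lets a status response omit the pickled resource handle, which the new summary_response flag on payloads.StatusBody (later in this diff) relies on; the new cluster_name_on_cloud field travels alongside it. A minimal sketch of the effect, using a simplified stand-in model rather than the real ResponseBaseModel:

from typing import Any, Optional
import pydantic

class StatusResponse(pydantic.BaseModel):  # simplified stand-in
    name: str
    handle: Optional[Any] = None           # can now be omitted entirely
    cluster_name_on_cloud: Optional[str] = None

# A summary-style payload no longer needs to carry a handle:
summary = StatusResponse(name='my-cluster',
                         cluster_name_on_cloud='my-cluster-abc123')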
sky/serve/autoscalers.py
CHANGED

@@ -411,6 +411,8 @@ class _AutoscalerWithHysteresis(Autoscaler):
         # `_set_target_num_replicas_with_hysteresis` to have the replicas
         # quickly scale after each update.
         self.target_num_replicas = self._calculate_target_num_replicas()
+        logger.debug(f'Target number of replicas: {self.target_num_replicas}'
+                     'after update_version.')
         # Cleanup hysteresis counters.
         self.upscale_counter = 0
         self.downscale_counter = 0
sky/serve/client/impl.py
CHANGED

@@ -105,7 +105,8 @@ def update(


 def apply(
-    task: Union['sky.Task', 'sky.Dag'],
+    task: Optional[Union['sky.Task', 'sky.Dag']],
+    workers: Optional[int],
     service_name: str,
     mode: 'serve_utils.UpdateMode',
     pool: bool = False,
@@ -117,35 +118,60 @@ def apply(
     # Avoid circular import.
     from sky.client import sdk  # pylint: disable=import-outside-toplevel

-
-
-
-
-
-
-
-
-
-
-
-
-                      abort=True,
-                      show_default=True)
-
-    dag = client_common.upload_mounts_to_api_server(dag)
-    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+    noun = 'pool' if pool else 'service'
+    # There are two cases here. If task is None, we should be trying to
+    # update the number of workers in the pool. If task is not None, we should
+    # be trying to apply a new config to the pool. The two code paths
+    # are slightly different with us needing to craft the dag and validate
+    # it if we have a task. In the future we could move this logic to the
+    # server side and simplify this code, for the time being we keep it here.
+    if task is None:
+        if workers is None:
+            raise ValueError(f'Cannot create a new {noun} without specifying '
                             f'task or workers. Please provide either a task '
                             f'or specify the number of workers.')

         body = payloads.JobsPoolApplyBody(
-
+            workers=workers,
             pool_name=service_name,
             mode=mode,
         )
+
         response = server_common.make_authenticated_request(
             'POST',
             '/jobs/pool_apply',
             json=json.loads(body.model_dump_json()),
             timeout=(5, None))
         return server_common.get_request_id(response)
+    else:
+        dag = dag_utils.convert_entrypoint_to_dag(task)
+        with admin_policy_utils.apply_and_use_config_in_current_request(
+                dag, at_client_side=True) as dag:
+            sdk.validate(dag)
+            request_id = sdk.optimize(dag)
+            sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+        body = payloads.JobsPoolApplyBody(
+            task=dag_str,
+            pool_name=service_name,
+            mode=mode,
+        )
+        response = server_common.make_authenticated_request(
+            'POST',
+            '/jobs/pool_apply',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None))
+        return server_common.get_request_id(response)


 def down(
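
Both branches end in the same POST to /jobs/pool_apply; only the request body differs. A hedged usage sketch of the two client-side invocations, assuming this module is imported as `impl` and a pool named 'my-pool' already exists:

# Sketch only: resize an existing pool to 3 workers (no task upload).
request_id = impl.apply(task=None, workers=3, service_name='my-pool',
                        mode=serve_utils.DEFAULT_UPDATE_MODE, pool=True)

# Sketch only: apply a new task config; the DAG is validated, optimized,
# uploaded, and sent as a YAML string in the same request body.
request_id = impl.apply(task=my_task, workers=None, service_name='my-pool',
                        mode=serve_utils.DEFAULT_UPDATE_MODE, pool=True)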
sky/serve/replica_managers.py
CHANGED

@@ -422,11 +422,12 @@ class ReplicaInfo:
         based on the cluster name.
         """
         if cluster_record is None:
-
+            handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
-
+        else:
+            handle = cluster_record['handle']
+        if handle is None:
             return None
-        handle = cluster_record['handle']
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         return handle

@@ -443,6 +444,12 @@ class ReplicaInfo:
         handle = self.handle()
         if handle is None:
             return None
+        if self.replica_port == '-':
+            # This is a pool replica so there is no endpoint and it's filled
+            # with this dummy value. We return None here so that we can
+            # get the active ready replicas and perform autoscaling. Otherwise,
+            # would error out when trying to get the endpoint.
+            return None
         replica_port_int = int(self.replica_port)
         try:
             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
@@ -470,7 +477,7 @@ class ReplicaInfo:
                 with_handle: bool,
                 with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
-            self.cluster_name)
+            self.cluster_name, include_user_info=False, summary_response=True)
         info_dict = {
             'replica_id': self.replica_id,
             'name': self.cluster_name,
@@ -956,7 +963,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         # provision) or the cluster is preempted and cleaned up by the status
         # refresh. In this case, we skip spawning a new down process to save
         # controller resources.
-        if global_user_state.
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
            self._handle_sky_down_finish(info, exitcode=0)
            return
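
The '-' port is a sentinel: pool replicas expose no service endpoint, so endpoint lookups must bail out before the int() conversion. A tiny self-contained sketch of that guard (hypothetical helper, not part of the diff):

from typing import Optional

def replica_endpoint_port(replica_port: str) -> Optional[int]:
    # Pool replicas carry the dummy port '-' and have no endpoint.
    if replica_port == '-':
        return None
    return int(replica_port)

assert replica_endpoint_port('-') is None
assert replica_endpoint_port('8080') == 8080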
sky/serve/serve_utils.py
CHANGED

@@ -262,7 +262,7 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
     controller = controller_utils.get_controller_for_pool(pool).value
     if current_is_consolidation_mode:
         controller_cn = controller.cluster_name
-        if global_user_state.
+        if global_user_state.cluster_with_name_exists(controller_cn):
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.InconsistentConsolidationModeError(
                     f'{colorama.Fore.RED}Consolidation mode for '
@@ -896,8 +896,8 @@ def _terminate_failed_services(
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.
-                replica_info.cluster_name)
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)

@@ -1133,10 +1133,8 @@ def _process_line(line: str,
     # `✓ Cluster launched: new-http. View logs at: *.log`
     # We should tail the detailed logs for user.
     def cluster_is_up() -> bool:
-
-
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP

     provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
                                         line)
sky/serve/server/core.py
CHANGED

@@ -46,20 +46,23 @@ def up(


 @usage_lib.entrypoint
-def update(
-
-
-
+def update(task: Optional['sky.Task'],
+           service_name: str,
+           mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+           workers: Optional[int] = None) -> None:
     """Updates an existing service.

     Please refer to the sky.cli.serve_update for the document.

     Args:
-        task: sky.Task to update
+        task: sky.Task to update, or None if updating
+            the number of workers/replicas.
         service_name: Name of the service.
         mode: Update mode.
+        workers: Number of workers/replicas to set for the service when
+            task is None.
     """
-    return impl.update(task, service_name, mode, pool=False)
+    return impl.update(task, service_name, mode, pool=False, workers=workers)


 @usage_lib.entrypoint
sky/serve/server/impl.py
CHANGED

@@ -411,6 +411,9 @@ def up(
             f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
             f'{ux_utils.BOLD}sky jobs pool down {service_name}'
             f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
+            f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
+            f'--workers 5{ux_utils.RESET_BOLD}'
             '\n\n' + ux_utils.finishing_message('Successfully created pool '
                                                 f'{service_name!r}.'))
     else:
@@ -448,37 +451,15 @@ def up(


 def update(
-    task: 'task_lib.Task',
+    task: Optional['task_lib.Task'],
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
+    workers: Optional[int] = None,
 ) -> None:
     """Updates an existing service or pool."""
     noun = 'pool' if pool else 'service'
     capnoun = noun.capitalize()
-    task.validate()
-    serve_utils.validate_service_task(task, pool=pool)
-
-    # Always apply the policy again here, even though it might have been applied
-    # in the CLI. This is to ensure that we apply the policy to the final DAG
-    # and get the mutated config.
-    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
-    # will not apply the config.
-    dag, _ = admin_policy_utils.apply(task)
-    task = dag.tasks[0]
-    if pool:
-        if task.run is not None:
-            logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
-                           f'ignored for pool.{colorama.Style.RESET_ALL}')
-        # Use dummy run script for cluster pool.
-        task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
-
-    assert task.service is not None
-    if not pool and task.service.tls_credential is not None:
-        logger.warning('Updating TLS keyfile and certfile is not supported. '
-                       'Any updates to the keyfile and certfile will not take '
-                       'effect. To update TLS keyfile and certfile, please '
-                       'tear down the service and spin up a new one.')

     controller_type = controller_utils.get_controller_for_pool(pool)
     handle = backend_utils.is_controller_accessible(
@@ -505,6 +486,77 @@ def update(
                    f'To spin up a {noun}, use {ux_utils.BOLD}'
                    f'{cmd}{ux_utils.RESET_BOLD}')

+    # If task is None and workers is specified, load existing configuration
+    # and update replica count.
+    if task is None:
+        if workers is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Cannot update {noun} without specifying '
+                    f'task or workers. Please provide either a task '
+                    f'or specify the number of workers.')
+
+        if not pool:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Non-pool service, trying to update replicas to '
+                    f'{workers} is not supported. Ignoring the update.')
+
+        # Load the existing task configuration from the service's YAML file
+        latest_yaml_path = serve_utils.generate_task_yaml_file_name(
+            service_name, service_record['version'], expand_user=False)
+
+        logger.debug('Loading existing task configuration from '
+                     f'{latest_yaml_path} to create a new modified task.')
+
+        # Get the path locally.
+        with tempfile.NamedTemporaryFile(
+                prefix=f'service-task-{service_name}-',
+                mode='w',
+        ) as service_file:
+            try:
+                backend.download_file(handle, latest_yaml_path,
+                                      service_file.name)
+            except exceptions.CommandError as e:
+                raise RuntimeError(
+                    f'Failed to download the old task configuration from '
+                    f'{latest_yaml_path}: {e.error_msg}') from e
+
+            # Load the existing task configuration
+            existing_config = yaml_utils.read_yaml(service_file.name)
+            task = task_lib.Task.from_yaml_config(existing_config)
+
+        if task.service is None:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('No service configuration found in '
+                                   f'existing {noun} {service_name!r}')
+        task.set_service(task.service.copy(min_replicas=workers))
+
+    task.validate()
+    serve_utils.validate_service_task(task, pool=pool)
+
+    # Now apply the policy and handle task-specific logic
+    # Always apply the policy again here, even though it might have been applied
+    # in the CLI. This is to ensure that we apply the policy to the final DAG
+    # and get the mutated config.
+    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
+    # will not apply the config.
+    dag, _ = admin_policy_utils.apply(task)
+    task = dag.tasks[0]
+    if pool:
+        if task.run is not None:
+            logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
+                           f'ignored for pool.{colorama.Style.RESET_ALL}')
+        # Use dummy run script for cluster pool.
+        task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
+
+    assert task.service is not None
+    if not pool and task.service.tls_credential is not None:
+        logger.warning('Updating TLS keyfile and certfile is not supported. '
+                       'Any updates to the keyfile and certfile will not take '
+                       'effect. To update TLS keyfile and certfile, please '
+                       'tear down the service and spin up a new one.')
+
     prompt = None
     if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
        ):
@@ -625,6 +677,7 @@ def update(

 def apply(
     task: 'task_lib.Task',
+    workers: Optional[int],
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
@@ -640,7 +693,7 @@ def apply(
         service_record = _get_service_record(service_name, pool, handle,
                                              backend)
         if service_record is not None:
-            return update(task, service_name, mode, pool)
+            return update(task, service_name, mode, pool, workers)
     except exceptions.ClusterNotUpError:
         pass
     up(task, service_name, pool)
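
Taken together, the workers-only path is driven with task=None and pool=True; the non-pool branch above deliberately rejects it. A hedged usage sketch, plus the CLI spelling that the pool-creation hint above advertises:

# Sketch only: resize an existing pool to 5 workers in-place.
impl.update(task=None, service_name='my-pool', pool=True, workers=5)

# Equivalent CLI, as printed by the pool-creation hint:
#   sky jobs pool apply --pool my-pool --workers 5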
sky/serve/server/server.py
CHANGED

@@ -98,7 +98,7 @@ async def tail_logs(
         request: fastapi.Request, log_body: payloads.ServeLogsBody,
         background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.
+    request_task = executor.prepare_request(
         request_id=request.state.request_id,
         request_name='serve.logs',
         request_body=log_body,
@@ -106,10 +106,9 @@ async def tail_logs(
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
     )
-
-
-
-
+    task = executor.execute_request_in_coroutine(request_task)
+    # Cancel the coroutine after the request is done or client disconnects
+    background_tasks.add_task(task.cancel)
     return stream_utils.stream_response(
         request_id=request_task.request_id,
         logs_path=request_task.log_path,
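
The handler now prepares the request, runs it as a coroutine on the server's event loop, and registers the CoroutineTask.cancel coroutine as a FastAPI background task, so the work is torn down when the response finishes or the client disconnects. A self-contained sketch of the same pattern with a hypothetical endpoint (not SkyPilot code):

import asyncio
import fastapi

app = fastapi.FastAPI()

async def _stream_work():
    while True:  # stand-in for log tailing
        await asyncio.sleep(1)

@app.get('/logs')
async def logs(background_tasks: fastapi.BackgroundTasks):
    task = asyncio.create_task(_stream_work())

    async def _cancel():
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    # Runs once the response is done, mirroring
    # `background_tasks.add_task(task.cancel)` above; in the real handler
    # the response is a StreamingResponse, so this fires when the stream
    # ends or the client disconnects.
    background_tasks.add_task(_cancel)
    return {'status': 'tailing started'}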
sky/serve/service_spec.py
CHANGED

@@ -506,3 +506,36 @@ class SkyServiceSpec:
         if not hasattr(self, '_pool'):
             return False
         return bool(self._pool)
+
+    def copy(self, **override) -> 'SkyServiceSpec':
+        return SkyServiceSpec(
+            readiness_path=override.pop('readiness_path', self._readiness_path),
+            initial_delay_seconds=override.pop('initial_delay_seconds',
+                                               self._initial_delay_seconds),
+            readiness_timeout_seconds=override.pop(
+                'readiness_timeout_seconds', self._readiness_timeout_seconds),
+            min_replicas=override.pop('min_replicas', self._min_replicas),
+            max_replicas=override.pop('max_replicas', self._max_replicas),
+            num_overprovision=override.pop('num_overprovision',
+                                           self._num_overprovision),
+            ports=override.pop('ports', self._ports),
+            target_qps_per_replica=override.pop('target_qps_per_replica',
+                                                self._target_qps_per_replica),
+            post_data=override.pop('post_data', self._post_data),
+            tls_credential=override.pop('tls_credential', self._tls_credential),
+            readiness_headers=override.pop('readiness_headers',
+                                           self._readiness_headers),
+            dynamic_ondemand_fallback=override.pop(
+                'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
+            base_ondemand_fallback_replicas=override.pop(
+                'base_ondemand_fallback_replicas',
+                self._base_ondemand_fallback_replicas),
+            spot_placer=override.pop('spot_placer', self._spot_placer),
+            upscale_delay_seconds=override.pop('upscale_delay_seconds',
+                                               self._upscale_delay_seconds),
+            downscale_delay_seconds=override.pop('downscale_delay_seconds',
+                                                 self._downscale_delay_seconds),
+            load_balancing_policy=override.pop('load_balancing_policy',
+                                               self._load_balancing_policy),
+            pool=override.pop('pool', self._pool),
+        )
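
copy() is a copy-with-overrides constructor: each keyword replaces the stored value and everything else carries over, which is exactly what the workers-only update path uses via task.service.copy(min_replicas=workers). A hedged usage sketch:

# Sketch only: assumes `spec` is an existing SkyServiceSpec, e.g. parsed
# from a service YAML.
resized = spec.copy(min_replicas=5)  # only the replica floor changes
duplicate = spec.copy()              # field-for-field duplicate

Note that because each field is read with override.pop(key, default), keyword arguments that do not match a known field are silently ignored rather than rejected.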
sky/server/constants.py
CHANGED

@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 20

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py
CHANGED

@@ -8,7 +8,6 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
 from sky.utils import annotations
-from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import subprocess_utils
@@ -94,13 +93,13 @@ class InternalRequestDaemon:
     def refresh_cluster_status_event():
         """Periodically refresh the cluster status."""
         # pylint: disable=import-outside-toplevel
-        from sky import
+        from sky.backends import backend_utils

         logger.info('=== Refreshing cluster status ===')
         # This periodically refresh will hold the lock for the cluster being
         # refreshed, but it is OK because other operations will just wait for
         # the lock and get the just refreshed status without refreshing again.
-
+        backend_utils.refresh_cluster_records()
         logger.info('Status refreshed. Sleeping '
                     f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
                     ' seconds for the next refresh...\n')
sky/server/requests/executor.py
CHANGED

@@ -502,7 +502,35 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
             name=request_name).observe(max(peak_rss - rss_begin, 0))


-
+class CoroutineTask:
+    """Wrapper of a background task runs in coroutine"""
+
+    def __init__(self, task: asyncio.Task):
+        self.task = task
+
+    async def cancel(self):
+        try:
+            self.task.cancel()
+            await self.task
+        except asyncio.CancelledError:
+            pass
+
+
+def execute_request_in_coroutine(
+        request: api_requests.Request) -> CoroutineTask:
+    """Execute a request in current event loop.
+
+    Args:
+        request: The request to execute.
+
+    Returns:
+        A CoroutineTask handle to operate the background task.
+    """
+    task = asyncio.create_task(_execute_request_coroutine(request))
+    return CoroutineTask(task)
+
+
+async def _execute_request_coroutine(request: api_requests.Request):
     """Execute a request in current event loop.

     Similar to _request_execution_wrapper, but executed as coroutine in current
@@ -640,13 +668,35 @@ def schedule_request(request_id: str,
         The precondition is waited asynchronously and does not block the
         caller.
     """
-    prepare_request(request_id, request_name, request_body, func,
-
+    request_task = prepare_request(request_id, request_name, request_body, func,
+                                   request_cluster_name, schedule_type,
+                                   is_skypilot_system)
+    schedule_prepared_request(request_task, ignore_return_value, precondition,
+                              retryable)
+
+
+def schedule_prepared_request(request_task: api_requests.Request,
+                              ignore_return_value: bool = False,
+                              precondition: Optional[
+                                  preconditions.Precondition] = None,
+                              retryable: bool = False) -> None:
+    """Enqueue a request to the request queue
+
+    Args:
+        request_task: The prepared request task to schedule.
+        ignore_return_value: If True, the return value of the function will be
+            ignored.
+        precondition: If a precondition is provided, the request will only be
+            scheduled for execution when the precondition is met (returns True).
+            The precondition is waited asynchronously and does not block the
+            caller.
+        retryable: Whether the request should be retried if it fails.
+    """

     def enqueue():
-        input_tuple = (request_id, ignore_return_value, retryable)
-        logger.info(f'Queuing request: {request_id}')
-        _get_queue(schedule_type).put(input_tuple)
+        input_tuple = (request_task.request_id, ignore_return_value, retryable)
+        logger.info(f'Queuing request: {request_task.request_id}')
+        _get_queue(request_task.schedule_type).put(input_tuple)

     if precondition is not None:
         # Wait async to avoid blocking caller.
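
schedule_request() is now split into prepare_request() (persist the request record) plus schedule_prepared_request() (enqueue it), which is what lets the serve log endpoint above prepare a request and then execute it in-process instead of queueing. A hedged sketch of the two-step form; argument names are taken from the hunks above, and request_id, body, and func are assumed to be in scope:

# Sketch only: the two-step form of schedule_request().
request_task = executor.prepare_request(
    request_id=request_id,
    request_name='serve.logs',
    request_body=body,
    func=func,
    schedule_type=api_requests.ScheduleType.SHORT,
)

# Either hand it to a worker process via the queue...
executor.schedule_prepared_request(request_task)
# ...or run it in-process on the current event loop and keep a cancel handle:
coro_task = executor.execute_request_in_coroutine(request_task)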
sky/server/requests/payloads.py
CHANGED

@@ -316,6 +316,9 @@ class StatusBody(RequestBody):
     all_users: bool = True
     # TODO (kyuds): default to False post 0.10.5
     include_credentials: bool = True
+    # Only return fields that are needed for the
+    # dashboard / CLI summary response
+    summary_response: bool = False


 class StartBody(RequestBody):
@@ -475,6 +478,17 @@ class VolumeListBody(RequestBody):
     pass


+class VolumeValidateBody(RequestBody):
+    """The request body for the volume validate endpoint."""
+    name: Optional[str] = None
+    volume_type: Optional[str] = None
+    infra: Optional[str] = None
+    size: Optional[str] = None
+    labels: Optional[Dict[str, str]] = None
+    resource_name: Optional[str] = None
+    config: Optional[Dict[str, Any]] = None
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
@@ -669,9 +683,15 @@ class LocalUpBody(RequestBody):
     ssh_key: Optional[str] = None
     cleanup: bool = False
     context_name: Optional[str] = None
+    name: Optional[str] = None
     password: Optional[str] = None


+class LocalDownBody(RequestBody):
+    """The request body for the local down endpoint."""
+    name: Optional[str] = None
+
+
 class SSHUpBody(RequestBody):
     """The request body for the SSH up/down endpoints."""
     infra: Optional[str] = None
@@ -709,19 +729,22 @@ class JobsDownloadLogsBody(RequestBody):

 class JobsPoolApplyBody(RequestBody):
     """The request body for the jobs pool apply endpoint."""
-    task: str
+    task: Optional[str] = None
+    workers: Optional[int] = None
     pool_name: str
     mode: serve.UpdateMode

     def to_kwargs(self) -> Dict[str, Any]:
         kwargs = super().to_kwargs()
-
-
-
-
-
-
-
+        if self.task is not None:
+            dag = common.process_mounts_in_task_on_api_server(
+                self.task, self.env_vars, workdir_only=False)
+            assert len(
+                dag.tasks) == 1, ('Must only specify one task in the DAG for '
+                                  'a pool.', dag)
+            kwargs['task'] = dag.tasks[0]
+        else:
+            kwargs['task'] = None
         return kwargs
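
On the wire, the two apply modes map onto JobsPoolApplyBody: either task carries a dumped DAG YAML string, or it is None and workers carries the resize. A simplified pydantic sketch of the two payload shapes (field set from the diff; the RequestBody machinery and the real serve.UpdateMode type are replaced with stand-ins):

from typing import Optional
import pydantic

class JobsPoolApplyBody(pydantic.BaseModel):  # simplified stand-in
    task: Optional[str] = None     # dumped DAG YAML, when applying a config
    workers: Optional[int] = None  # target worker count, when resizing
    pool_name: str
    mode: str                      # serve.UpdateMode in the real model

resize = JobsPoolApplyBody(pool_name='my-pool', mode='rolling', workers=5)
apply_config = JobsPoolApplyBody(pool_name='my-pool', mode='rolling',
                                 task='<dumped DAG YAML>')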