skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py
CHANGED
@@ -1,4 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
+import collections
 import dataclasses
 import enum
 import functools
@@ -23,6 +24,7 @@ from sky import execution
 from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -34,6 +36,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
 
@@ -45,8 +48,6 @@ logger = sky_logging.init_logger(__name__)
 
 _JOB_STATUS_FETCH_INTERVAL = 30
 _PROCESS_POOL_REFRESH_INTERVAL = 20
-# TODO(tian): Maybe let user determine this threshold
-_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
@@ -180,6 +181,8 @@ def _get_resources_ports(service_task_yaml_path: str) -> str:
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
+    if task.service.pool:
+        return '-'
     assert task.service.ports is not None, task
     return task.service.ports
 
@@ -445,8 +448,8 @@ class ReplicaInfo:
             return None
         replica_port_int = int(self.replica_port)
         try:
-            endpoint_dict =
-
+            endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
+                                                        replica_port_int)
         except exceptions.ClusterNotUpError:
             return None
         endpoint = endpoint_dict.get(replica_port_int, None)
@@ -466,7 +469,9 @@ class ReplicaInfo:
                          f'replica {self.replica_id}.')
         return replica_status
 
-    def to_info_dict(self,
+    def to_info_dict(self,
+                     with_handle: bool,
+                     with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
             self.cluster_name)
         info_dict = {
@@ -474,18 +479,26 @@ class ReplicaInfo:
             'name': self.cluster_name,
             'status': self.status,
             'version': self.version,
-            'endpoint': self.url,
+            'endpoint': self.url if with_url else None,
             'is_spot': self.is_spot,
             'launched_at': (cluster_record['launched_at']
                             if cluster_record is not None else None),
         }
         if with_handle:
-
+            handle = self.handle(cluster_record)
+            info_dict['handle'] = handle
+            if handle is not None:
+                info_dict['cloud'] = repr(handle.launched_resources.cloud)
+                info_dict['region'] = handle.launched_resources.region
+                info_dict['resources_str'] = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=True))
         return info_dict
 
     def __repr__(self) -> str:
-
-
+        show_details = env_options.Options.SHOW_DEBUG_INFO.get()
+        info_dict = self.to_info_dict(with_handle=show_details,
+                                      with_url=show_details)
         handle_str = ''
         if 'handle' in info_dict:
             handle_str = f', handle={info_dict["handle"]}'
@@ -499,6 +512,33 @@ class ReplicaInfo:
             f'launched_at={info_dict["launched_at"]}{handle_str})')
         return info
 
+    def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
+        """Probe the replica for pool management.
+
+        This function will check the first job status of the cluster, which is a
+        dummy job that only echoes "setup done". The success of this job means
+        the setup command is done and the replica is ready to be used. Check
+        sky/serve/server/core.py::up for more details.
+
+        Returns:
+            Tuple of (self, is_ready, probe_time).
+        """
+        probe_time = time.time()
+        try:
+            handle = backend_utils.check_cluster_available(
+                self.cluster_name, operation='probing pool')
+            if handle is None:
+                return self, False, probe_time
+            backend = backend_utils.get_backend_from_handle(handle)
+            statuses = backend.get_job_status(handle, [1], stream_logs=False)
+            if statuses[1] == job_lib.JobStatus.SUCCEEDED:
+                return self, True, probe_time
+            return self, False, probe_time
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error when probing pool of {self.cluster_name}: '
+                         f'{common_utils.format_exception(e)}.')
+            return self, False, probe_time
+
     def probe(
         self,
         readiness_path: str,
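Note: the new probe_pool returns a (self, is_ready, probe_time) tuple instead of raising, so the manager can fan probes out over a worker pool and match each result back to its replica (see the probing hunk further down). A minimal sketch of that pattern, not SkyPilot code, using a hypothetical StubReplica in place of ReplicaInfo:

# Sketch of the fan-out pattern: each probe returns
# (replica, is_ready, probe_time) rather than raising, so results can be
# paired with their replicas as they complete.
import time
from multiprocessing import pool as mp_pool


class StubReplica:
    """Hypothetical stand-in for ReplicaInfo."""

    def __init__(self, replica_id: int, healthy: bool):
        self.replica_id = replica_id
        self._healthy = healthy

    def probe_pool(self):
        # The real method checks job 1 (the dummy "setup done" job) via
        # the backend; here we return a canned result.
        return self, self._healthy, time.time()


replicas = [StubReplica(1, True), StubReplica(2, False)]
with mp_pool.ThreadPool(processes=len(replicas)) as pool:
    futures = [pool.apply_async(r.probe_pool) for r in replicas]
    for future in futures:
        info, is_ready, probe_time = future.get()
        print(f'replica_{info.replica_id}: ready={is_ready}')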
@@ -588,6 +628,7 @@ class ReplicaManager:
         self._service_name: str = service_name
         self._uptime: Optional[float] = None
         self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
+        self._is_pool: bool = spec.pool
         header_keys = None
         if spec.readiness_headers is not None:
             header_keys = list(spec.readiness_headers.keys())
@@ -601,6 +642,15 @@ class ReplicaManager:
         # Oldest version among the currently provisioned and launched replicas
         self.least_recent_version: int = serve_constants.INITIAL_VERSION
 
+    def _consecutive_failure_threshold_timeout(self) -> int:
+        """The timeout for the consecutive failure threshold in seconds.
+
+        We reduce the timeout for pool to 10 seconds to make the pool more
+        responsive to the failure.
+        """
+        # TODO(tian): Maybe let user determine this threshold
+        return 10 if self._is_pool else 180
+
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         """Scale up the service by 1 replica with resources_override.
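Note: this threshold is compared against the time span between the first and the latest consecutive probe failure, not against a failure count (see the hunk around old line 1204 below). A small illustration of that windowing, not SkyPilot code, assuming plain epoch timestamps:

# Illustration: a replica is only torn down once its consecutive probe
# failures span more than the threshold, i.e. the window from the first
# to the latest failure exceeds it.
consecutive_failure_times = [100.0, 130.0, 160.0, 290.0]  # epoch seconds
threshold = 180  # seconds; the new code uses 10 for pools

window = consecutive_failure_times[-1] - consecutive_failure_times[0]
if window >= threshold:
    print(f'exceeded threshold ({window:.0f}s >= {threshold}s): terminate')
else:
    print(f'within threshold ({window:.0f}s < {threshold}s): keep probing')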
@@ -822,9 +872,8 @@ class SkyPilotReplicaManager(ReplicaManager):
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'replica_jobs')
-        job_log_file_name = (
-
-            backend, handle, replica_job_logs_dir))
+        job_log_file_name = (controller_utils.download_and_stream_job_log(
+            backend, handle, replica_job_logs_dir))
         if job_log_file_name is not None:
             logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
             with open(log_file_name, 'a',
@@ -937,6 +986,7 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._service_name, replica_id)
         assert info is not None, replica_id
         error_in_sky_launch = False
+        schedule_next_jobs = False
         if info.status == serve_state.ReplicaStatus.PENDING:
             # sky.launch not started yet
             if (serve_state.total_number_provisioning_replicas() <
@@ -965,6 +1015,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 else:
                     info.status_property.sky_launch_status = (
                         ProcessStatus.SUCCEEDED)
+                    schedule_next_jobs = True
             if self._spot_placer is not None and info.is_spot:
                 # TODO(tian): Currently, we set the location to
                 # preemptive if the launch process failed. This is
@@ -984,6 +1035,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._spot_placer.set_active(location)
         serve_state.add_or_update_replica(self._service_name,
                                           replica_id, info)
+        if schedule_next_jobs and self._is_pool:
+            jobs_scheduler.maybe_schedule_next_jobs(
+                pool=self._service_name)
         if error_in_sky_launch:
             # Teardown after update replica info since
             # _terminate_replica will update the replica info too.
@@ -1100,9 +1154,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch latest job, which stands for user task job
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1112,7 +1167,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status =
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
@@ -1156,18 +1212,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-
-                f'replica_{info.replica_id}(
-
-                pool.apply_async(
-
-
-
-
-
-
-
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
 
             # Since futures.as_completed will return futures in the order of
@@ -1204,8 +1266,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                         consecutive_failure_time = (
                             info.consecutive_failure_times[-1] -
                             info.consecutive_failure_times[0])
-
-
+                        failure_threshold = (
+                            self._consecutive_failure_threshold_timeout())
+                        if consecutive_failure_time >= failure_threshold:
                             logger.info(
                                 f'Replica {info.replica_id} is not ready for '
                                 'too long and exceeding consecutive failure '
@@ -1216,8 +1279,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                                 f'Replica {info.replica_id} is not ready '
                                 'but within consecutive failure threshold '
                                 f'({consecutive_failure_time}s / '
-                                f'{
-                                'Skipping.')
+                                f'{failure_threshold}s). Skipping.')
                         else:
                             initial_delay_seconds = self._get_initial_delay_seconds(
                                 info.version)
@@ -1310,8 +1372,10 @@ class SkyPilotReplicaManager(ReplicaManager):
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
@@ -1321,17 +1385,24 @@ class SkyPilotReplicaManager(ReplicaManager):
                    self._service_name, info.version))
                 old_config = common_utils.read_yaml(
                     os.path.expanduser(old_service_task_yaml_path))
-                for key in ['service']:
-                    old_config.pop(key)
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
                 # the same.
                 # Here, we manually convert the any_of field to a set to avoid
                 # only the difference in the random order of the any_of fields.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-
-
-
+
+                def normalize_dict_list(lst):
+                    return collections.Counter(
+                        frozenset(d.items()) for d in lst)
+
+                if (normalize_dict_list(old_config_any_of) !=
+                        normalize_dict_list(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
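Note: normalize_dict_list compares the two any_of lists as multisets, so a mere reordering of equivalent resource dicts no longer counts as a config change. This relies on the dict values being hashable, which holds for the scalar fields of a resources spec. A self-contained example:

# Order-insensitive comparison of two lists of dicts, as in the hunk above.
import collections


def normalize_dict_list(lst):
    return collections.Counter(frozenset(d.items()) for d in lst)


old_any_of = [{'cloud': 'aws', 'cpus': 4}, {'cloud': 'gcp', 'cpus': 4}]
new_any_of = [{'cloud': 'gcp', 'cpus': 4}, {'cloud': 'aws', 'cpus': 4}]

# Same entries in a different order compare equal, so the replica version
# is bumped instead of the replica being relaunched.
assert normalize_dict_list(old_any_of) == normalize_dict_list(new_any_of)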
@@ -1345,6 +1416,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                     info.version = version
                     serve_state.add_or_update_replica(self._service_name,
                                                       info.replica_id, info)
+                else:
+                    logger.info('Replica config changed (rest), skipping. '
+                                f'old: {old_config}, '
+                                f'new: {new_config}')
 
     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)
sky/serve/serve_state.py
CHANGED
@@ -68,6 +68,9 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
     # Whether the service's load balancer is encrypted with TLS.
     db_utils.add_column_to_table(cursor, conn, 'services', 'tls_encrypted',
                                  'INTEGER DEFAULT 0')
+    # Whether the service is a cluster pool.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'pool',
+                                 'INTEGER DEFAULT 0')
     conn.commit()
 
 
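Note: the new pool column is added via db_utils.add_column_to_table with a default of 0, so rows written by older versions read back as pool=False. A rough sketch of what such an idempotent helper typically looks like on sqlite3 (the real implementation lives in sky/utils/db and may differ):

# Hedged sketch of an "add column if missing" helper for sqlite3; the real
# db_utils.add_column_to_table may differ.
import sqlite3


def add_column_to_table(cursor, conn, table, column, column_spec):
    try:
        cursor.execute(
            f'ALTER TABLE {table} ADD COLUMN {column} {column_spec}')
    except sqlite3.OperationalError as e:
        if 'duplicate column' not in str(e).lower():
            raise  # only swallow the already-added case
    conn.commit()


conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE services (name TEXT)')
add_column_to_table(cursor, conn, 'services', 'pool', 'INTEGER DEFAULT 0')
add_column_to_table(cursor, conn, 'services', 'pool', 'INTEGER DEFAULT 0')  # no-op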
@@ -269,7 +272,7 @@ _SERVICE_STATUS_TO_COLOR = {
 @init_db
 def add_service(name: str, controller_job_id: int, policy: str,
                 requested_resources_str: str, load_balancing_policy: str,
-                status: ServiceStatus, tls_encrypted: bool) -> bool:
+                status: ServiceStatus, tls_encrypted: bool, pool: bool) -> bool:
     """Add a service in the database.
 
     Returns:
@@ -283,11 +286,12 @@ def add_service(name: str, controller_job_id: int, policy: str,
                """\
                INSERT INTO services
                (name, controller_job_id, status, policy,
-               requested_resources_str, load_balancing_policy, tls_encrypted
-
+               requested_resources_str, load_balancing_policy, tls_encrypted,
+               pool)
+               VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (name, controller_job_id, status.value, policy,
                 requested_resources_str, load_balancing_policy,
-                int(tls_encrypted)))
+                int(tls_encrypted), int(pool)))
 
     except sqlite3.IntegrityError as e:
         if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
@@ -364,8 +368,8 @@ def set_service_load_balancer_port(service_name: str,
 def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
-     _, active_versions, load_balancing_policy, tls_encrypted) = row[:
-
+     _, active_versions, load_balancing_policy, tls_encrypted, pool) = row[:16]
+    record = {
         'name': name,
         'controller_job_id': controller_job_id,
         'controller_port': controller_port,
@@ -383,7 +387,13 @@ def _get_service_from_row(row) -> Dict[str, Any]:
         'requested_resources_str': requested_resources_str,
         'load_balancing_policy': load_balancing_policy,
         'tls_encrypted': bool(tls_encrypted),
+        'pool': bool(pool),
     }
+    latest_spec = get_spec(name, current_version)
+    if latest_spec is not None:
+        record['policy'] = latest_spec.autoscaling_policy_str()
+        record['load_balancing_policy'] = latest_spec.load_balancing_policy
+    return record
 
 
 @init_db
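Note: the pool flag round-trips as an INTEGER, mirroring tls_encrypted: add_service stores int(pool) and _get_service_from_row restores it with bool(pool). A compact sketch of that round trip against an in-memory table, with the schema trimmed to the fields this diff touches:

# Sketch of the bool -> INTEGER -> bool round trip for the pool flag;
# schema trimmed to the fields touched by this diff.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE services '
             '(name TEXT PRIMARY KEY, tls_encrypted INTEGER DEFAULT 0, '
             'pool INTEGER DEFAULT 0)')
conn.execute('INSERT INTO services (name, tls_encrypted, pool) '
             'VALUES (?, ?, ?)', ('my-pool', int(False), int(True)))

name, tls_encrypted, pool = conn.execute(
    'SELECT name, tls_encrypted, pool FROM services').fetchone()
record = {'name': name,
          'tls_encrypted': bool(tls_encrypted),
          'pool': bool(pool)}
assert record['pool'] is True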