skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py
CHANGED
@@ -1,4 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
+import collections
 import dataclasses
 import enum
 import functools
@@ -23,6 +24,7 @@ from sky import execution
 from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -34,6 +36,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
 
@@ -45,8 +48,6 @@ logger = sky_logging.init_logger(__name__)
 
 _JOB_STATUS_FETCH_INTERVAL = 30
 _PROCESS_POOL_REFRESH_INTERVAL = 20
-# TODO(tian): Maybe let user determine this threshold
-_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
@@ -180,6 +181,8 @@ def _get_resources_ports(service_task_yaml_path: str) -> str:
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
+    if task.service.pool:
+        return '-'
     assert task.service.ports is not None, task
     return task.service.ports
 
@@ -445,8 +448,8 @@ class ReplicaInfo:
             return None
         replica_port_int = int(self.replica_port)
         try:
-            endpoint_dict =
-
+            endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
+                                                        replica_port_int)
         except exceptions.ClusterNotUpError:
             return None
         endpoint = endpoint_dict.get(replica_port_int, None)
@@ -466,7 +469,9 @@ class ReplicaInfo:
                          f'replica {self.replica_id}.')
         return replica_status
 
-    def to_info_dict(self,
+    def to_info_dict(self,
+                     with_handle: bool,
+                     with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
             self.cluster_name)
         info_dict = {
@@ -474,18 +479,26 @@ class ReplicaInfo:
             'name': self.cluster_name,
             'status': self.status,
             'version': self.version,
-            'endpoint': self.url,
+            'endpoint': self.url if with_url else None,
             'is_spot': self.is_spot,
             'launched_at': (cluster_record['launched_at']
                             if cluster_record is not None else None),
         }
         if with_handle:
-
+            handle = self.handle(cluster_record)
+            info_dict['handle'] = handle
+            if handle is not None:
+                info_dict['cloud'] = repr(handle.launched_resources.cloud)
+                info_dict['region'] = handle.launched_resources.region
+                info_dict['resources_str'] = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=True))
         return info_dict
 
     def __repr__(self) -> str:
-
-
+        show_details = env_options.Options.SHOW_DEBUG_INFO.get()
+        info_dict = self.to_info_dict(with_handle=show_details,
+                                      with_url=show_details)
         handle_str = ''
         if 'handle' in info_dict:
             handle_str = f', handle={info_dict["handle"]}'
@@ -499,6 +512,33 @@ class ReplicaInfo:
             f'launched_at={info_dict["launched_at"]}{handle_str})')
         return info
 
+    def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
+        """Probe the replica for pool management.
+
+        This function will check the first job status of the cluster, which is a
+        dummy job that only echoes "setup done". The success of this job means
+        the setup command is done and the replica is ready to be used. Check
+        sky/serve/server/core.py::up for more details.
+
+        Returns:
+            Tuple of (self, is_ready, probe_time).
+        """
+        probe_time = time.time()
+        try:
+            handle = backend_utils.check_cluster_available(
+                self.cluster_name, operation='probing pool')
+            if handle is None:
+                return self, False, probe_time
+            backend = backend_utils.get_backend_from_handle(handle)
+            statuses = backend.get_job_status(handle, [1], stream_logs=False)
+            if statuses[1] == job_lib.JobStatus.SUCCEEDED:
+                return self, True, probe_time
+            return self, False, probe_time
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error when probing pool of {self.cluster_name}: '
+                         f'{common_utils.format_exception(e)}.')
+            return self, False, probe_time
+
     def probe(
         self,
         readiness_path: str,
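Note: the new probe_pool returns a (self, is_ready, probe_time) tuple instead of raising, so the manager can fan probes out over a worker pool and match each result back to its replica (see the probing hunk further down). A minimal sketch of that pattern, not SkyPilot code, using a hypothetical StubReplica in place of ReplicaInfo:

# Sketch of the fan-out pattern: each probe returns
# (replica, is_ready, probe_time) rather than raising, so results can be
# paired with their replicas as they complete.
import time
from multiprocessing import pool as mp_pool


class StubReplica:
    """Hypothetical stand-in for ReplicaInfo."""

    def __init__(self, replica_id: int, healthy: bool):
        self.replica_id = replica_id
        self._healthy = healthy

    def probe_pool(self):
        # The real method checks job 1 (the dummy "setup done" job) via
        # the backend; here we return a canned result.
        return self, self._healthy, time.time()


replicas = [StubReplica(1, True), StubReplica(2, False)]
with mp_pool.ThreadPool(processes=len(replicas)) as pool:
    futures = [pool.apply_async(r.probe_pool) for r in replicas]
    for future in futures:
        info, is_ready, probe_time = future.get()
        print(f'replica_{info.replica_id}: ready={is_ready}')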
@@ -588,6 +628,7 @@ class ReplicaManager:
         self._service_name: str = service_name
         self._uptime: Optional[float] = None
         self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
+        self._is_pool: bool = spec.pool
         header_keys = None
         if spec.readiness_headers is not None:
             header_keys = list(spec.readiness_headers.keys())
@@ -601,6 +642,15 @@ class ReplicaManager:
         # Oldest version among the currently provisioned and launched replicas
         self.least_recent_version: int = serve_constants.INITIAL_VERSION
 
+    def _consecutive_failure_threshold_timeout(self) -> int:
+        """The timeout for the consecutive failure threshold in seconds.
+
+        We reduce the timeout for pool to 10 seconds to make the pool more
+        responsive to the failure.
+        """
+        # TODO(tian): Maybe let user determine this threshold
+        return 10 if self._is_pool else 180
+
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         """Scale up the service by 1 replica with resources_override.
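Note: this threshold is compared against the time span between the first and the latest consecutive probe failure, not against a failure count (see the hunk around old line 1204 below). A small illustration of that windowing, not SkyPilot code, assuming plain epoch timestamps:

# Illustration: a replica is only torn down once its consecutive probe
# failures span more than the threshold, i.e. the window from the first
# to the latest failure exceeds it.
consecutive_failure_times = [100.0, 130.0, 160.0, 290.0]  # epoch seconds
threshold = 180  # seconds; the new code uses 10 for pools

window = consecutive_failure_times[-1] - consecutive_failure_times[0]
if window >= threshold:
    print(f'exceeded threshold ({window:.0f}s >= {threshold}s): terminate')
else:
    print(f'within threshold ({window:.0f}s < {threshold}s): keep probing')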
@@ -822,9 +872,8 @@ class SkyPilotReplicaManager(ReplicaManager):
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'replica_jobs')
-        job_log_file_name = (
-
-            backend, handle, replica_job_logs_dir))
+        job_log_file_name = (controller_utils.download_and_stream_job_log(
+            backend, handle, replica_job_logs_dir))
         if job_log_file_name is not None:
             logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
             with open(log_file_name, 'a',
@@ -937,6 +986,7 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._service_name, replica_id)
         assert info is not None, replica_id
         error_in_sky_launch = False
+        schedule_next_jobs = False
         if info.status == serve_state.ReplicaStatus.PENDING:
             # sky.launch not started yet
             if (serve_state.total_number_provisioning_replicas() <
@@ -965,6 +1015,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 else:
                     info.status_property.sky_launch_status = (
                         ProcessStatus.SUCCEEDED)
+                    schedule_next_jobs = True
             if self._spot_placer is not None and info.is_spot:
                 # TODO(tian): Currently, we set the location to
                 # preemptive if the launch process failed. This is
@@ -984,6 +1035,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._spot_placer.set_active(location)
         serve_state.add_or_update_replica(self._service_name,
                                           replica_id, info)
+        if schedule_next_jobs and self._is_pool:
+            jobs_scheduler.maybe_schedule_next_jobs(
+                pool=self._service_name)
         if error_in_sky_launch:
             # Teardown after update replica info since
             # _terminate_replica will update the replica info too.
@@ -1100,9 +1154,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch latest job, which stands for user task job
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1112,7 +1167,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status =
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
@@ -1156,18 +1212,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-
-                f'replica_{info.replica_id}(
-
-                pool.apply_async(
-
-
-
-
-
-
-
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
 
             # Since futures.as_completed will return futures in the order of
@@ -1204,8 +1266,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                         consecutive_failure_time = (
                             info.consecutive_failure_times[-1] -
                             info.consecutive_failure_times[0])
-
-
+                        failure_threshold = (
+                            self._consecutive_failure_threshold_timeout())
+                        if consecutive_failure_time >= failure_threshold:
                             logger.info(
                                 f'Replica {info.replica_id} is not ready for '
                                 'too long and exceeding consecutive failure '
@@ -1216,8 +1279,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                                 f'Replica {info.replica_id} is not ready '
                                 'but within consecutive failure threshold '
                                 f'({consecutive_failure_time}s / '
-                                f'{
-                                'Skipping.')
+                                f'{failure_threshold}s). Skipping.')
                         else:
                             initial_delay_seconds = self._get_initial_delay_seconds(
                                 info.version)
@@ -1310,8 +1372,10 @@ class SkyPilotReplicaManager(ReplicaManager):
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
@@ -1321,17 +1385,24 @@ class SkyPilotReplicaManager(ReplicaManager):
                    self._service_name, info.version))
                 old_config = common_utils.read_yaml(
                     os.path.expanduser(old_service_task_yaml_path))
-                for key in ['service']:
-                    old_config.pop(key)
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
                 # the same.
                 # Here, we manually convert the any_of field to a set to avoid
                 # only the difference in the random order of the any_of fields.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-
-
-
+
+                def normalize_dict_list(lst):
+                    return collections.Counter(
+                        frozenset(d.items()) for d in lst)
+
+                if (normalize_dict_list(old_config_any_of) !=
+                        normalize_dict_list(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
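Note: normalize_dict_list compares the two any_of lists as multisets, so a mere reordering of equivalent resource dicts no longer counts as a config change. This relies on the dict values being hashable, which holds for the scalar fields of a resources spec. A self-contained example:

# Order-insensitive comparison of two lists of dicts, as in the hunk above.
import collections


def normalize_dict_list(lst):
    return collections.Counter(frozenset(d.items()) for d in lst)


old_any_of = [{'cloud': 'aws', 'cpus': 4}, {'cloud': 'gcp', 'cpus': 4}]
new_any_of = [{'cloud': 'gcp', 'cpus': 4}, {'cloud': 'aws', 'cpus': 4}]

# Same entries in a different order compare equal, so the replica version
# is bumped instead of the replica being relaunched.
assert normalize_dict_list(old_any_of) == normalize_dict_list(new_any_of)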
@@ -1345,6 +1416,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                     info.version = version
                     serve_state.add_or_update_replica(self._service_name,
                                                       info.replica_id, info)
+                else:
+                    logger.info('Replica config changed (rest), skipping. '
+                                f'old: {old_config}, '
+                                f'new: {new_config}')
 
     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)
sky/serve/serve_state.py
CHANGED
@@ -68,6 +68,9 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
     # Whether the service's load balancer is encrypted with TLS.
     db_utils.add_column_to_table(cursor, conn, 'services', 'tls_encrypted',
                                  'INTEGER DEFAULT 0')
+    # Whether the service is a cluster pool.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'pool',
+                                 'INTEGER DEFAULT 0')
     conn.commit()
 
 
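Note: the new pool column is added via db_utils.add_column_to_table with a default of 0, so rows written by older versions read back as pool=False. A rough sketch of what such an idempotent helper typically looks like on sqlite3 (the real implementation lives in sky/utils/db and may differ):

# Hedged sketch of an "add column if missing" helper for sqlite3; the real
# db_utils.add_column_to_table may differ.
import sqlite3


def add_column_to_table(cursor, conn, table, column, column_spec):
    try:
        cursor.execute(
            f'ALTER TABLE {table} ADD COLUMN {column} {column_spec}')
    except sqlite3.OperationalError as e:
        if 'duplicate column' not in str(e).lower():
            raise  # only swallow the already-added case
    conn.commit()


conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE services (name TEXT)')
add_column_to_table(cursor, conn, 'services', 'pool', 'INTEGER DEFAULT 0')
add_column_to_table(cursor, conn, 'services', 'pool', 'INTEGER DEFAULT 0')  # no-op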
@@ -269,7 +272,7 @@ _SERVICE_STATUS_TO_COLOR = {
 @init_db
 def add_service(name: str, controller_job_id: int, policy: str,
                 requested_resources_str: str, load_balancing_policy: str,
-                status: ServiceStatus, tls_encrypted: bool) -> bool:
+                status: ServiceStatus, tls_encrypted: bool, pool: bool) -> bool:
     """Add a service in the database.
 
     Returns:
@@ -283,11 +286,12 @@ def add_service(name: str, controller_job_id: int, policy: str,
                """\
                INSERT INTO services
                (name, controller_job_id, status, policy,
-               requested_resources_str, load_balancing_policy, tls_encrypted
-
+               requested_resources_str, load_balancing_policy, tls_encrypted,
+               pool)
+               VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (name, controller_job_id, status.value, policy,
                 requested_resources_str, load_balancing_policy,
-                int(tls_encrypted)))
+                int(tls_encrypted), int(pool)))
 
     except sqlite3.IntegrityError as e:
         if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
@@ -364,8 +368,8 @@ def set_service_load_balancer_port(service_name: str,
 def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
-     _, active_versions, load_balancing_policy, tls_encrypted) = row[:
-
+     _, active_versions, load_balancing_policy, tls_encrypted, pool) = row[:16]
+    record = {
         'name': name,
         'controller_job_id': controller_job_id,
         'controller_port': controller_port,
@@ -383,7 +387,13 @@ def _get_service_from_row(row) -> Dict[str, Any]:
         'requested_resources_str': requested_resources_str,
         'load_balancing_policy': load_balancing_policy,
         'tls_encrypted': bool(tls_encrypted),
+        'pool': bool(pool),
     }
+    latest_spec = get_spec(name, current_version)
+    if latest_spec is not None:
+        record['policy'] = latest_spec.autoscaling_policy_str()
+        record['load_balancing_policy'] = latest_spec.load_balancing_policy
+    return record
 
 
 @init_db
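Note: the pool flag round-trips as an INTEGER, mirroring tls_encrypted: add_service stores int(pool) and _get_service_from_row restores it with bool(pool). A compact sketch of that round trip against an in-memory table, with the schema trimmed to the fields this diff touches:

# Sketch of the bool -> INTEGER -> bool round trip for the pool flag;
# schema trimmed to the fields touched by this diff.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE services '
             '(name TEXT PRIMARY KEY, tls_encrypted INTEGER DEFAULT 0, '
             'pool INTEGER DEFAULT 0)')
conn.execute('INSERT INTO services (name, tls_encrypted, pool) '
             'VALUES (?, ?, ?)', ('my-pool', int(False), int(True)))

name, tls_encrypted, pool = conn.execute(
    'SELECT name, tls_encrypted, pool FROM services').fetchone()
record = {'name': name,
          'tls_encrypted': bool(tls_encrypted),
          'pool': bool(pool)}
assert record['pool'] is True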