skypilot-nightly 1.0.0.dev20250817__py3-none-any.whl → 1.0.0.dev20250819__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/catalog/data_fetchers/fetch_aws.py +2 -0
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
  5. sky/dashboard/out/_next/static/chunks/8969-23c8fbdb8b397d59.js +1 -0
  6. sky/dashboard/out/_next/static/chunks/{webpack-b6987eb47888da9c.js → webpack-008593a02784a2df.js} +1 -1
  7. sky/dashboard/out/_next/static/{s93sHgT13r_pnzP1An3gW → tYn7R2be3cQPYJfTxxE09}/_buildManifest.js +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/jobs/constants.py +1 -1
  24. sky/jobs/server/core.py +42 -33
  25. sky/jobs/server/utils.py +2 -1
  26. sky/jobs/utils.py +56 -9
  27. sky/provision/provisioner.py +10 -6
  28. sky/server/common.py +2 -4
  29. sky/server/requests/payloads.py +1 -0
  30. sky/server/requests/serializers/encoders.py +15 -3
  31. sky/server/server.py +4 -1
  32. sky/setup_files/MANIFEST.in +1 -0
  33. sky/skylet/ray_patches/__init__.py +17 -3
  34. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  35. sky/skylet/ray_patches/cli.py.diff +19 -0
  36. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  37. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  38. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  39. sky/skylet/ray_patches/updater.py.diff +18 -0
  40. sky/skylet/ray_patches/worker.py.diff +41 -0
  41. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/METADATA +1 -1
  42. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/RECORD +47 -40
  43. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +0 -1
  44. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +0 -1
  45. /sky/dashboard/out/_next/static/{s93sHgT13r_pnzP1An3gW → tYn7R2be3cQPYJfTxxE09}/_ssgManifest.js +0 -0
  46. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -514,7 +514,7 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
         job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
@@ -587,31 +587,36 @@ def queue(
     pool_match: Optional[str] = None,
     page: Optional[int] = None,
     limit: Optional[int] = None,
-) -> Tuple[List[Dict[str, Any]], int]:
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
     Please refer to sky.cli.job_queue for documentation.
 
     Returns:
-        [
-            {
-                'job_id': int,
-                'job_name': str,
-                'resources': str,
-                'submitted_at': (float) timestamp of submission,
-                'end_at': (float) timestamp of end,
-                'job_duration': (float) duration in seconds,
-                'recovery_count': (int) Number of retries,
-                'status': (sky.jobs.ManagedJobStatus) of the job,
-                'cluster_resources': (str) resources of the cluster,
-                'region': (str) region of the cluster,
-                'user_name': (Optional[str]) job creator's user name,
-                'user_hash': (str) job creator's user hash,
-                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
-                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
-            }
-        ]
+        jobs: List[Dict[str, Any]]
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'job_duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+                'user_name': (Optional[str]) job creator's user name,
+                'user_hash': (str) job creator's user hash,
+                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+            }
+        ]
+        total: int, total number of jobs after filter
+        status_counts: Dict[str, int], status counts after filter
+        total_no_filter: int, total number of jobs before filter
     Raises:
         sky.exceptions.ClusterNotUpError: the jobs controller is not up or
             does not exist.
@@ -645,13 +650,13 @@ def queue(
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
-            return [], 0
+            return [], 0, {}, 0
         user_hashes = [user.id for user in users]
 
     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
         skip_finished, accessible_workspaces, job_ids, workspace_match,
-        name_match, pool_match, page, limit, user_hashes)
+        name_match, pool_match, page, limit, user_hashes, statuses)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -664,11 +669,11 @@ def queue(
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
-        job_table_payload)
+    (jobs, total, result_type, total_no_filter, status_counts
+    ) = managed_job_utils.load_managed_job_queue(job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
-        return jobs, total
+        return jobs, total, status_counts, total_no_filter
 
     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
@@ -702,14 +707,18 @@ def queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return managed_job_utils.filter_jobs(jobs,
-                                         workspace_match,
-                                         name_match,
-                                         pool_match,
-                                         page=page,
-                                         limit=limit,
-                                         user_match=user_match,
-                                         enable_user_match=True)
+    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+        jobs,
+        workspace_match,
+        name_match,
+        pool_match,
+        page=page,
+        limit=limit,
+        user_match=user_match,
+        enable_user_match=True,
+        statuses=statuses,
+    )
+    return filtered_jobs, total, status_counts, total_no_filter
 
 
 @usage_lib.entrypoint
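
The user-visible effect of this change: sky.jobs.queue() now returns a 4-tuple instead of a 2-tuple. A minimal consumer sketch (not SkyPilot code; the literal job data is made up):

    # Hypothetical consumer of the new queue() return shape; the field
    # meanings mirror the docstring above.
    from typing import Any, Dict, List, Tuple

    QueueResult = Tuple[List[Dict[str, Any]], int, Dict[str, int], int]

    def summarize(result: QueueResult) -> str:
        jobs, total, status_counts, total_no_filter = result
        counts = ', '.join(f'{s}={n}' for s, n in sorted(status_counts.items()))
        return (f'{len(jobs)} of {total} filtered jobs shown; '
                f'{total_no_filter} before filtering; counts: {counts}')

    print(summarize(([{'job_id': 1}], 1, {'RUNNING': 1}, 5)))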
sky/jobs/server/utils.py CHANGED
@@ -62,7 +62,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
sky/jobs/utils.py CHANGED
@@ -768,6 +768,13 @@ def stream_logs_by_id(job_id: int,
                 assert tail > 0
                 # Read only the last 'tail' lines using deque
                 read_from = collections.deque(f, maxlen=tail)
+                # We set start_streaming to True here in case
+                # truncating the log file removes the line that
+                # contains LOG_FILE_START_STREAMING_AT. This does
+                # not cause issues for log files shorter than tail
+                # because tail_logs in sky/skylet/log_lib.py also
+                # handles LOG_FILE_START_STREAMING_AT.
+                start_streaming = True
                 for line in read_from:
                     if log_lib.LOG_FILE_START_STREAMING_AT in line:
                         start_streaming = True
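
As background on the technique in this hunk, collections.deque(iterable, maxlen=k) scans the whole iterable but retains only the last k items, which is why the code can cheaply keep the final tail lines. A standalone sketch:

    import collections
    import io

    log = io.StringIO('line1\nline2\nline3\nline4\n')
    tail = 2
    # deque with maxlen keeps only the most recent `tail` items while
    # scanning the file once, using O(tail) memory.
    read_from = collections.deque(log, maxlen=tail)
    assert list(read_from) == ['line3\n', 'line4\n']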
@@ -1133,6 +1140,7 @@ def dump_managed_job_queue(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
 ) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
@@ -1160,6 +1168,8 @@ def dump_managed_job_queue(
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    total_no_filter = len(jobs)
+
     if user_hashes:
         jobs = [
             job for job in jobs if job.get('user_hash', None) in user_hashes
@@ -1183,8 +1193,13 @@ def dump_managed_job_queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
-                              page, limit)
+    jobs, total, status_counts = filter_jobs(jobs,
+                                             workspace_match,
+                                             name_match,
+                                             pool_match,
+                                             page,
+                                             limit,
+                                             statuses=statuses)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1258,7 +1273,12 @@ def dump_managed_job_queue(
     else:
         job['details'] = None
 
-    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+    return message_utils.encode_payload({
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    })
 
 
 def filter_jobs(
@@ -1270,7 +1290,8 @@ def filter_jobs(
     limit: Optional[int],
     user_match: Optional[str] = None,
     enable_user_match: bool = False,
-) -> Tuple[List[Dict[str, Any]], int]:
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
     """Filter jobs based on the given criteria.
 
     Args:
@@ -1282,9 +1303,12 @@ def filter_jobs(
         limit: Limit to filter.
         user_match: User name to filter.
         enable_user_match: Whether to enable user match.
+        statuses: Statuses to filter.
 
     Returns:
-        List of filtered jobs and total number of jobs.
+        List of filtered jobs
+        Total number of jobs
+        Dictionary of status counts
     """
 
     # TODO(hailong): refactor the whole function including the
@@ -1314,6 +1338,7 @@ def filter_jobs(
         end = min(start + limit, len(result))
         return result[start:end]
 
+    status_counts: Dict[str, int] = collections.defaultdict(int)
     result = []
     checks = [
         ('workspace', workspace_match),
@@ -1327,25 +1352,34 @@ def filter_jobs(
         if not all(
                 _pattern_matches(job, key, pattern) for key, pattern in checks):
             continue
+        status_counts[job['status'].value] += 1
+        if statuses:
+            if job['status'].value not in statuses:
+                continue
         result.append(job)
 
     total = len(result)
 
-    return _handle_page_and_limit(result, page, limit), total
+    return _handle_page_and_limit(result, page, limit), total, status_counts
 
 
 def load_managed_job_queue(
     payload: str
-) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
+        str, int]]:
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
+    status_counts = {}
     if isinstance(result, dict):
        jobs = result['jobs']
        total = result['total']
+       status_counts = result.get('status_counts', {})
+       total_no_filter = result.get('total_no_filter', total)
    else:
        jobs = result
        total = len(jobs)
+       total_no_filter = total
        result_type = ManagedJobQueueResultType.LIST
 
     for job in jobs:
@@ -1355,7 +1389,7 @@ def load_managed_job_queue(
         # TODO(cooperc): Remove check before 0.12.0.
         user = global_user_state.get_user(job['user_hash'])
         job['user_name'] = user.name if user is not None else None
-    return jobs, total, result_type
+    return jobs, total, result_type, total_no_filter, status_counts
 
 
 def _get_job_status_from_tasks(
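
The wire format stays a single encoded payload, and the .get() defaults keep load_managed_job_queue compatible with controllers that predate the two new keys. A hedged sketch of the round-trip, with plain json standing in for SkyPilot's message_utils helpers:

    import json

    def dump(jobs, total, total_no_filter, status_counts):
        return json.dumps({'jobs': jobs, 'total': total,
                           'total_no_filter': total_no_filter,
                           'status_counts': status_counts})

    def load(payload):
        result = json.loads(payload)
        if isinstance(result, dict):
            total = result['total']
            # .get() defaults cover dict payloads from controllers that
            # predate the two new keys.
            return (result['jobs'], total,
                    result.get('total_no_filter', total),
                    result.get('status_counts', {}))
        # Oldest controllers sent a bare list.
        return result, len(result), len(result), {}

    print(load(dump([{'job_id': 1}], 1, 5, {'RUNNING': 1})))
    print(load(json.dumps([{'job_id': 2}])))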
@@ -1713,6 +1747,7 @@ class ManagedJobCodeGen:
         page: Optional[int] = None,
         limit: Optional[int] = None,
         user_hashes: Optional[List[Optional[str]]] = None,
+        statuses: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
         if managed_job_version < 9:
@@ -1720,7 +1755,7 @@ class ManagedJobCodeGen:
             # before #6652.
             # TODO(hailong): Remove compatibility before 0.12.0
             job_table = utils.dump_managed_job_queue()
-        else:
+        elif managed_job_version < 10:
             job_table = utils.dump_managed_job_queue(
                 skip_finished={skip_finished},
                 accessible_workspaces={accessible_workspaces!r},
@@ -1731,6 +1766,18 @@ class ManagedJobCodeGen:
                 page={page!r},
                 limit={limit!r},
                 user_hashes={user_hashes!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
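
The codegen now branches on managed_job_version so a newer client never passes keyword arguments that an older controller's dump_managed_job_queue would reject. A toy sketch of that gating pattern (the rendered calls are illustrative, not the exact generated code):

    # Sketch of the version-gating idea used by get_job_table above.
    def render_call(managed_job_version: int, statuses=None) -> str:
        if managed_job_version < 9:
            return 'utils.dump_managed_job_queue()'
        if managed_job_version < 10:
            return 'utils.dump_managed_job_queue(page=None, limit=None)'
        return ('utils.dump_managed_job_queue(page=None, limit=None, '
                f'statuses={statuses!r})')

    print(render_call(9))                # v9 controller: no statuses kwarg
    print(render_call(10, ['RUNNING']))  # v10+: statuses supported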
sky/provision/provisioner.py CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
         # This error is a user error instead of a provisioning failure.
         # And there is no possibility to fix it by teardown.
         raise
-    except Exception:  # pylint: disable=broad-except
+    except Exception as exc:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
             zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@ def bulk_provision(
                     provider_config=original_config['provider'])
                 break
             except NotImplementedError as e:
-                verb = 'terminate' if terminate else 'stop'
+                assert not terminate, (
+                    'Terminating must be supported by all clouds')
+                exc_msg = common_utils.format_exception(exc).replace(
+                    '\n', ' ')
                 # If the underlying cloud does not support stopping
                 # instances, we should stop failover as well.
                 raise provision_common.StopFailoverError(
-                    'During provisioner\'s failover, '
-                    f'{terminate_str.lower()} {cluster_name!r} failed. '
-                    f'We cannot {verb} the resources launched, as it is '
-                    f'not supported by {cloud}. Please try launching the '
+                    f'Provisioning cluster {cluster_name.display_name} '
+                    f'failed: {exc_msg}. Failover is stopped for safety '
+                    'because the cluster was previously in UP state but '
+                    f'{cloud} does not support stopping instances to '
+                    'preserve the cluster state. Please try launching the '
                     'cluster again, or terminate it with: '
                     f'sky down {cluster_name.display_name}') from e
             except Exception as e:  # pylint: disable=broad-except
sky/server/common.py CHANGED
@@ -561,15 +561,13 @@ def _start_api_server(deploy: bool = False,
     # For spawn mode, copy the environ to avoid polluting the SDK process.
     server_env = os.environ.copy()
     server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
-    _set_metrics_env_var(server_env, metrics, deploy)
     # Start the API server process in the background and don't wait for it.
     # If this is called from a CLI invocation, we need
     # start_new_session=True so that SIGINT on the CLI will not also kill
     # the API server.
-    server_env = os.environ.copy()
-    server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
     if enable_basic_auth:
         server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
+    _set_metrics_env_var(server_env, metrics, deploy)
     with open(log_path, 'w', encoding='utf-8') as log_file:
         # Because the log file is opened using a with statement, it may seem
         # that the file will be closed when the with statement is exited
@@ -643,7 +641,7 @@ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
         deploy: Whether the server is running in deploy mode, which means
             multiple processes might be running.
     """
-    if metrics:
+    if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
         env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
     if deploy:
         metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
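
With this change the env var acts as an independent on-switch: metrics stay enabled if either the metrics flag is set or the variable was already exported. A sketch of the resulting predicate (the env var name here is hypothetical; the real one lives in constants.ENV_VAR_SERVER_METRICS_ENABLED):

    import os

    # Hypothetical env var name for illustration only.
    METRICS_VAR = 'SKYPILOT_SERVER_METRICS_ENABLED'

    def metrics_enabled(metrics_flag: bool) -> bool:
        # The flag wins, but an env var exported by a previous run (or by
        # an operator) keeps metrics on across restarts too.
        return metrics_flag or os.getenv(METRICS_VAR) == 'true'

    os.environ[METRICS_VAR] = 'true'
    assert metrics_enabled(False)  # the env var alone is now enough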
sky/server/requests/payloads.py CHANGED
@@ -503,6 +503,7 @@ class JobsQueueBody(RequestBody):
     pool_match: Optional[str] = None
     page: Optional[int] = None
     limit: Optional[int] = None
+    statuses: Optional[List[str]] = None
 
 
 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/encoders.py CHANGED
@@ -113,8 +113,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
-        jobs, total = jobs_or_tuple
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +129,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {'jobs': jobs, 'total': total}
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }
 
 
 def _encode_serve_status(
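
The encoder now accepts three shapes: a bare jobs list, the legacy (jobs, total) tuple, and the new 4-tuple. A compact sketch of just the dispatch, with a stand-in enum for sky.jobs.ManagedJobStatus:

    import enum

    class Status(enum.Enum):  # stand-in for sky.jobs.ManagedJobStatus
        RUNNING = 'RUNNING'

    def shape_of(jobs_or_tuple):
        # Mirrors the dispatch above: bare list, legacy 2-tuple, new 4-tuple.
        if not isinstance(jobs_or_tuple, tuple):
            return 'list'
        if len(jobs_or_tuple) == 2:
            return 'legacy 2-tuple (jobs, total)'
        if len(jobs_or_tuple) == 4:
            return '4-tuple (jobs, total, status_counts, total_no_filter)'
        raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')

    print(shape_of([{'status': Status.RUNNING}]))
    print(shape_of(([], 0)))
    print(shape_of(([], 0, {}, 0)))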
sky/server/server.py CHANGED
@@ -1650,7 +1650,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
 
-    cluster_records = core.status(cluster_name, all_users=True)
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
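
context_utils.to_thread is SkyPilot's own wrapper; the standard-library equivalent of the pattern, offloading a blocking call so the event loop stays responsive, is asyncio.to_thread. A self-contained sketch with a stand-in for core.status:

    import asyncio
    import time

    def blocking_status(name: str):
        # Stand-in for core.status: a slow, synchronous call.
        time.sleep(0.1)
        return [{'name': name, 'status': 'UP'}]

    async def handler():
        # Offloading to a worker thread keeps the event loop free to serve
        # other websocket connections while the blocking call runs.
        records = await asyncio.to_thread(blocking_status, 'my-cluster')
        return records[0]

    print(asyncio.run(handler()))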
sky/setup_files/MANIFEST.in CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/skylet/ray_patches/__init__.py CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
         which patch >/dev/null 2>&1 || sudo yum install -y patch || true
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
-        # It is ok to patch again from the original file.
-        patch {orig_file} -i {patch_file} -o {target_file}
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
         """
     subprocess.run(script, shell=True, check=True)
 
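A Python-side sketch of the same fallback decision the generated shell script makes (function and argument names here are illustrative, not SkyPilot's): prefer the system patch binary, otherwise apply the .diff with the pure-Python patch package via python -m patch, as the script above does.

    import os
    import shutil
    import subprocess

    def apply_patch(orig_file: str, patch_file: str, target_file: str) -> None:
        """Prefer the system patch binary; else use the Python patch package."""
        if shutil.which('patch'):
            # Same call as the generated script: re-patching from the
            # pristine .orig copy keeps the operation idempotent.
            subprocess.run(
                ['patch', orig_file, '-i', patch_file, '-o', target_file],
                check=True)
        else:
            diff_file = patch_file.replace('.patch', '.diff')
            target_dir = os.path.dirname(target_file)
            subprocess.run(
                ['python', '-m', 'patch', '-d', target_dir, diff_file],
                check=True)
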
sky/skylet/ray_patches/autoscaler.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/autoscaler.py
++++ b/autoscaler.py
+@@ -1,3 +1,6 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
++# Sky patch changes:
++# - enable upscaling_speed to be 0.0
+ import copy
+ import logging
+ import math
+@@ -1071,7 +1074,7 @@
+         upscaling_speed = self.config.get("upscaling_speed")
+         aggressive = self.config.get("autoscaling_mode") == "aggressive"
+         target_utilization_fraction = self.config.get("target_utilization_fraction")
+-        if upscaling_speed:
++        if upscaling_speed is not None:  # NOTE(sky): enable 0.0
+             upscaling_speed = float(upscaling_speed)
+         # TODO(ameer): consider adding (if users ask) an option of
+         # initial_upscaling_num_workers.
sky/skylet/ray_patches/cli.py.diff ADDED
@@ -0,0 +1,19 @@
+--- a/cli.py
++++ b/cli.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
++# Otherwise, the output redirection ">" will not work.
++
+ import json
+ import os
+ import sys
+@@ -270,7 +274,7 @@
+         working_dir=working_dir,
+     )
+     job_id = client.submit_job(
+-        entrypoint=list2cmdline(entrypoint),
++        entrypoint=" ".join(entrypoint),
+         submission_id=submission_id,
+         runtime_env=final_runtime_env,
+         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff ADDED
@@ -0,0 +1,17 @@
+--- a/command_runner.py
++++ b/command_runner.py
+@@ -1,3 +1,5 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
++
+ import hashlib
+ import json
+ import logging
+@@ -137,7 +139,7 @@
+             {
+                 "ControlMaster": "auto",
+                 "ControlPath": "{}/%C".format(control_path),
+-                "ControlPersist": "10s",
++                "ControlPersist": "300s",
+             }
+         )
+         self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff ADDED
@@ -0,0 +1,20 @@
+--- a/log_monitor.py
++++ b/log_monitor.py
+@@ -1,3 +1,7 @@
++# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
++# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
++# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
++
+ import argparse
+ import errno
+ import glob
+@@ -374,7 +378,8 @@
+             next_line = next_line.decode("utf-8", "replace")
+             if next_line == "":
+                 break
+-            next_line = next_line.rstrip("\r\n")
++            if next_line.endswith("\n"):
++                next_line = next_line[:-1]
+ 
+             if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
+                 flush()  # Possible change of task/actor name.
sky/skylet/ray_patches/resource_demand_scheduler.py.diff ADDED
@@ -0,0 +1,32 @@
+--- a/resource_demand_scheduler.py
++++ b/resource_demand_scheduler.py
+@@ -1,3 +1,8 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
++# Sky patch changes:
++# - no new nodes are allowed to be launched launched when the upscaling_speed is 0
++# - comment out "assert not unfulfilled": this seems a buggy assert
++
+ """Implements multi-node-type autoscaling.
+ 
+ This file implements an autoscaling algorithm that is aware of multiple node
+@@ -448,7 +453,10 @@
+             + placement_group_nodes.get(node_type, 0),
+         )
+ 
+-        if upper_bound > 0:
++        # NOTE(sky): do not autoscale when upsclaing speed is 0.
++        if self.upscaling_speed == 0:
++            upper_bound = 0
++        if upper_bound >= 0:
+             updated_nodes_to_launch[node_type] = min(
+                 upper_bound, to_launch[node_type]
+             )
+@@ -592,7 +600,7 @@
+         unfulfilled, including_reserved = get_bin_pack_residual(
+             new_node_resources, unfulfilled, strict_spread=True
+         )
+-        assert not unfulfilled
++        # assert not unfulfilled  # NOTE(sky): buggy assert.
+         node_resources += including_reserved
+         return to_add, node_resources, node_type_counts
+ 
sky/skylet/ray_patches/updater.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/updater.py
++++ b/updater.py
+@@ -1,3 +1,7 @@
++# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
++# Sky patch changes:
++# - Ensure the node state is refreshed before checking the node is terminated.
++
+ import logging
+ import os
+ import subprocess
+@@ -325,6 +329,7 @@
+             )
+ 
+             time.sleep(READY_CHECK_INTERVAL)
++            self.provider.non_terminated_nodes({})
+ 
+     def do_update(self):
+         self.provider.set_node_tags(
sky/skylet/ray_patches/worker.py.diff ADDED
@@ -0,0 +1,41 @@
+--- a/worker.py
++++ b/worker.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
++# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
++
+ import atexit
+ import faulthandler
+ import functools
+@@ -2020,6 +2024,14 @@
+     pid = data.get("pid")
+     lines = data.get("lines", [])
+ 
++    def end_for(line: str) -> str:
++        if sys.platform == "win32":
++            return "\n"
++        if line.endswith("\r"):
++            return ""
++        return "\n"
++
++
+     if data.get("ip") == data.get("localhost"):
+         for line in lines:
+             if RAY_TQDM_MAGIC in line:
+@@ -2035,6 +2047,7 @@
+                     message_for(data, line),
+                 ),
+                 file=print_file,
++                end=end_for(line),
+             )
+     else:
+         for line in lines:
+@@ -2052,6 +2065,7 @@
+                     message_for(data, line),
+                 ),
+                 file=print_file,
++                end=end_for(line),
+             )
+     # Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
+     restore_tqdm()
{skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250817
+Version: 1.0.0.dev20250819
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0