skypilot-nightly 1.0.0.dev20250225__py3-none-any.whl → 1.0.0.dev20250227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/jobs/server/core.py CHANGED
@@ -140,6 +140,7 @@ def launch(
     prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
     remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
     remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
+    remote_env_file_path = f'{prefix}/{dag.name}-{dag_uuid}.env'
     controller_resources = controller_utils.get_controller_resources(
         controller=controller_utils.Controllers.JOBS_CONTROLLER,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
@@ -152,6 +153,7 @@ def launch(
         # Note: actual cluster name will be <task.name>-<managed job ID>
         'dag_name': dag.name,
         'remote_user_config_path': remote_user_config_path,
+        'remote_env_file_path': remote_env_file_path,
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
@@ -318,7 +320,9 @@ def _maybe_restart_controller(


 @usage_lib.entrypoint
-def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
+def queue(refresh: bool,
+          skip_finished: bool = False,
+          all_users: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.

@@ -366,6 +370,19 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
                 f'{returncode}')

     jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+
+    if not all_users:
+
+        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+            user_hash = job.get('user_hash', None)
+            if user_hash is None:
+                # For backwards compatibility, we show jobs that do not have a
+                # user_hash. TODO(cooperc): Remove before 0.12.0.
+                return True
+            return user_hash == common_utils.get_user_hash()
+
+        jobs = list(filter(user_hash_matches_or_missing, jobs))
+
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -374,6 +391,7 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
         non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
         jobs = list(
             filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
     return jobs


@@ -381,7 +399,8 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
 # pylint: disable=redefined-builtin
 def cancel(name: Optional[str] = None,
            job_ids: Optional[List[int]] = None,
-           all: bool = False) -> None:
+           all: bool = False,
+           all_users: bool = False) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancels managed jobs.

@@ -397,17 +416,22 @@ def cancel(name: Optional[str] = None,
         stopped_message='All managed jobs should have finished.')

     job_id_str = ','.join(map(str, job_ids))
-    if sum([bool(job_ids), name is not None, all]) != 1:
-        argument_str = f'job_ids={job_id_str}' if job_ids else ''
-        argument_str += f' name={name}' if name is not None else ''
-        argument_str += ' all' if all else ''
+    if sum([bool(job_ids), name is not None, all or all_users]) != 1:
+        arguments = []
+        arguments += [f'job_ids={job_id_str}'] if job_ids else []
+        arguments += [f'name={name}'] if name is not None else []
+        arguments += ['all'] if all else []
+        arguments += ['all_users'] if all_users else []
         with ux_utils.print_exception_no_traceback():
-            raise ValueError('Can only specify one of JOB_IDS or name or all. '
-                             f'Provided {argument_str!r}.')
+            raise ValueError('Can only specify one of JOB_IDS, name, or all/'
+                             f'all_users. Provided {" ".join(arguments)!r}.')

     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
-    if all:
+    if all_users:
+        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+            None, all_users=True)
+    elif all:
         code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
     elif job_ids:
         code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
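
Together, the changes above extend the managed jobs API so that listing and cancellation can span all users. A minimal usage sketch, assuming the client-side `sky.jobs` module forwards these new flags to the server handlers shown above (the exact client call shape may differ):

# Usage sketch only: assumes sky.jobs mirrors the server-side signatures.
import sky.jobs as managed_jobs

# Default: only the current user's jobs are returned.
my_jobs = managed_jobs.queue(refresh=False, skip_finished=True)

# New in this release: list jobs from all users.
all_jobs = managed_jobs.queue(refresh=False, skip_finished=True,
                              all_users=True)

# Cancel all of the current user's jobs, or every user's jobs.
managed_jobs.cancel(all=True)
managed_jobs.cancel(all_users=True)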
sky/jobs/server/server.py CHANGED
@@ -109,9 +109,18 @@ async def download_logs(
 @router.get('/dashboard')
 async def dashboard(request: fastapi.Request,
                     user_hash: str) -> fastapi.Response:
+    # TODO(cooperc): Support showing only jobs for a specific user.
+
+    # FIX(zhwu/cooperc/eric): Fix log downloading (assumes global
+    # /download_log/xx route)
+
     # Note: before #4717, each user had their own controller, and thus their own
     # dashboard. Now, all users share the same controller, so this isn't really
     # necessary. TODO(cooperc): clean up.
+
+    # TODO: Put this in an executor to avoid blocking the main server thread.
+    # It can take a long time if it needs to check the controller status.
+
     # Find the port for the dashboard of the user
     os.environ[constants.USER_ID_ENV_VAR] = user_hash
     server_common.reload_for_new_request(client_entrypoint=None,
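
The executor TODO above corresponds to a standard asyncio pattern for keeping blocking work off the event loop. A minimal sketch under that assumption (the handler and helper names here are illustrative, not SkyPilot's actual code):

# Sketch only: off-loading a blocking call from an async FastAPI handler,
# as suggested by the TODO above.
import asyncio

import fastapi

app = fastapi.FastAPI()


def _check_controller_status() -> str:
    # Placeholder for a slow, blocking status check.
    return 'UP'


@app.get('/dashboard-status')
async def dashboard_status() -> dict:
    loop = asyncio.get_running_loop()
    # run_in_executor(None, ...) uses the default thread pool, so the
    # event loop stays free while the blocking check runs.
    status = await loop.run_in_executor(None, _check_controller_status)
    return {'status': status}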
sky/jobs/state.py CHANGED
@@ -116,7 +116,9 @@ def create_table(cursor, conn):
         name TEXT,
         schedule_state TEXT,
         controller_pid INTEGER DEFAULT NULL,
-        dag_yaml_path TEXT)""")
+        dag_yaml_path TEXT,
+        env_file_path TEXT,
+        user_hash TEXT)""")

     db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
                                  'TEXT')
@@ -127,6 +129,11 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'job_info', 'dag_yaml_path',
                                  'TEXT')

+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'env_file_path',
+                                 'TEXT')
+
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'user_hash', 'TEXT')
+
     conn.commit()


@@ -181,6 +188,8 @@ columns = [
     'schedule_state',
     'controller_pid',
     'dag_yaml_path',
+    'env_file_path',
+    'user_hash',
 ]


@@ -683,20 +692,24 @@ def set_local_log_file(job_id: int, task_id: Optional[int],


 # ======== utility functions ========
-def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
+def get_nonterminal_job_ids_by_name(name: Optional[str],
+                                    all_users: bool = False) -> List[int]:
     """Get non-terminal job ids by name."""
     statuses = ', '.join(['?'] * len(ManagedJobStatus.terminal_statuses()))
     field_values = [
         status.value for status in ManagedJobStatus.terminal_statuses()
     ]

-    name_filter = ''
+    job_filter = ''
+    if name is None and not all_users:
+        job_filter += 'AND (job_info.user_hash=(?)) '
+        field_values.append(common_utils.get_user_hash())
     if name is not None:
         # We match the job name from `job_info` for the jobs submitted after
         # #1982, and from `spot` for the jobs submitted before #1982, whose
         # job_info is not available.
-        name_filter = ('AND (job_info.name=(?) OR '
-                       '(job_info.name IS NULL AND spot.task_name=(?)))')
+        job_filter += ('AND (job_info.name=(?) OR '
+                       '(job_info.name IS NULL AND spot.task_name=(?))) ')
         field_values.extend([name, name])

     # Left outer join is used here instead of join, because the job_info does
@@ -710,7 +723,7 @@ def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
             ON spot.spot_job_id=job_info.spot_job_id
             WHERE status NOT IN
             ({statuses})
-            {name_filter}
+            {job_filter}
             ORDER BY spot.spot_job_id DESC""", field_values).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
         return job_ids
@@ -906,6 +919,9 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     # existing controller before #1982, the job_info table may not exist,
     # and all the managed jobs created before will not present in the
     # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         rows = cursor.execute(f"""\
             SELECT *
@@ -978,14 +994,17 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 # scheduler lock to work correctly.


-def scheduler_set_waiting(job_id: int, dag_yaml_path: str) -> None:
+def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
+                          user_hash: str) -> None:
     """Do not call without holding the scheduler lock."""
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
-            'schedule_state = (?), dag_yaml_path = (?) '
+            'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
+            ' user_hash = (?) '
             'WHERE spot_job_id = (?) AND schedule_state = (?)',
-            (ManagedJobScheduleState.WAITING.value, dag_yaml_path, job_id,
+            (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
+             env_file_path, user_hash, job_id,
              ManagedJobScheduleState.INACTIVE.value)).rowcount
         assert updated_count == 1, (job_id, updated_count)

@@ -1085,7 +1104,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
     """
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         row = cursor.execute(
-            'SELECT spot_job_id, schedule_state, dag_yaml_path '
+            'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
            'FROM job_info '
            'WHERE schedule_state in (?, ?) '
            'ORDER BY spot_job_id LIMIT 1',
@@ -1095,4 +1114,5 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
             'job_id': row[0],
             'schedule_state': ManagedJobScheduleState(row[1]),
             'dag_yaml_path': row[2],
+            'env_file_path': row[3],
         } if row is not None else None
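
The `db_utils.add_column_to_table` calls above follow the file's usual forward-only migration pattern: new columns appear both in `CREATE TABLE` (fresh databases) and as idempotent backfills (existing databases). A self-contained sketch of the idea with plain sqlite3 (illustrative; not SkyPilot's exact helper):

# Illustrative sketch of an idempotent "add column if missing" migration,
# mirroring what db_utils.add_column_to_table does for env_file_path and
# user_hash. Not SkyPilot's exact implementation.
import sqlite3


def add_column_if_missing(conn: sqlite3.Connection, table: str, column: str,
                          col_type: str) -> None:
    # PRAGMA table_info returns one row per column; row[1] is the name.
    cols = [row[1] for row in conn.execute(f'PRAGMA table_info({table})')]
    if column not in cols:
        conn.execute(f'ALTER TABLE {table} ADD COLUMN {column} {col_type}')
        conn.commit()


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE job_info (spot_job_id INTEGER, name TEXT)')
add_column_if_missing(conn, 'job_info', 'env_file_path', 'TEXT')
add_column_if_missing(conn, 'job_info', 'user_hash', 'TEXT')
# Safe to run again: the column check makes the migration idempotent.
add_column_if_missing(conn, 'job_info', 'user_hash', 'TEXT')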
sky/jobs/utils.py CHANGED
@@ -449,13 +449,15 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
     return f'{cluster_name}-{job_id}'


-def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
+def cancel_jobs_by_id(job_ids: Optional[List[int]],
+                      all_users: bool = False) -> str:
     """Cancel jobs by id.

     If job_ids is None, cancel all jobs.
     """
     if job_ids is None:
-        job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
+        job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
+            None, all_users)
     job_ids = list(set(job_ids))
     if not job_ids:
         return 'No job to cancel.'
@@ -917,6 +919,7 @@ def _get_job_status_from_tasks(
 @typing.overload
 def format_job_table(tasks: List[Dict[str, Any]],
                      show_all: bool,
+                     show_user: bool,
                      return_rows: Literal[False] = False,
                      max_jobs: Optional[int] = None) -> str:
     ...
@@ -925,6 +928,7 @@ def format_job_table(tasks: List[Dict[str, Any]],
 @typing.overload
 def format_job_table(tasks: List[Dict[str, Any]],
                      show_all: bool,
+                     show_user: bool,
                      return_rows: Literal[True],
                      max_jobs: Optional[int] = None) -> List[List[str]]:
     ...
@@ -933,6 +937,7 @@ def format_job_table(tasks: List[Dict[str, Any]],
 def format_job_table(
         tasks: List[Dict[str, Any]],
         show_all: bool,
+        show_user: bool,
         return_rows: bool = False,
         max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
     """Returns managed jobs as a formatted string.
@@ -948,13 +953,14 @@ def format_job_table(
         a list of "rows" (each of which is a list of str).
     """
     jobs = collections.defaultdict(list)
-    # Check if the tasks have user information.
-    tasks_have_user = any([task.get('user') for task in tasks])
-    if max_jobs and tasks_have_user:
+    # Check if the tasks have user information from kubernetes.
+    # This is only used for sky status --kubernetes.
+    tasks_have_k8s_user = any([task.get('user') for task in tasks])
+    if max_jobs and tasks_have_k8s_user:
         raise ValueError('max_jobs is not supported when tasks have user info.')

     def get_hash(task):
-        if tasks_have_user:
+        if tasks_have_k8s_user:
             return (task['user'], task['job_id'])
         return task['job_id']

@@ -969,10 +975,17 @@ def format_job_table(
         if not managed_job_status.is_terminal():
             status_counts[managed_job_status.value] += 1

+    user_cols: List[str] = []
+    if show_user:
+        user_cols = ['USER']
+        if show_all:
+            user_cols.append('USER_ID')
+
     columns = [
         'ID',
         'TASK',
         'NAME',
+        *user_cols,
         'RESOURCES',
         'SUBMITTED',
         'TOT. DURATION',
@@ -983,7 +996,7 @@ def format_job_table(
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
         columns += ['STARTED', 'CLUSTER', 'REGION', 'SCHED. STATE', 'DETAILS']
-    if tasks_have_user:
+    if tasks_have_k8s_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)

@@ -1006,6 +1019,22 @@ def format_job_table(
             return f'Failure: {failure_reason}'
         return '-'

+    def get_user_column_values(task: Dict[str, Any]) -> List[str]:
+        user_values: List[str] = []
+        if show_user:
+
+            user_name = '-'
+            user_hash = task.get('user_hash', None)
+            if user_hash:
+                user = global_user_state.get_user(user_hash)
+                user_name = user.name if user.name else '-'
+            user_values = [user_name]
+
+            if show_all:
+                user_values.append(user_hash if user_hash is not None else '-')
+
+        return user_values
+
     for job_hash, job_tasks in jobs.items():
         if show_all:
             schedule_state = job_tasks[0]['schedule_state']
@@ -1044,11 +1073,14 @@ def format_job_table(
             if not managed_job_status.is_terminal():
                 status_str += f' (task: {current_task_id})'

-            job_id = job_hash[1] if tasks_have_user else job_hash
+            user_values = get_user_column_values(job_tasks[0])
+
+            job_id = job_hash[1] if tasks_have_k8s_user else job_hash
             job_values = [
                 job_id,
                 '',
                 job_name,
+                *user_values,
                 '-',
                 submitted,
                 total_duration,
@@ -1065,7 +1097,7 @@ def format_job_table(
                     job_tasks[0]['schedule_state'],
                     generate_details(failure_reason),
                 ])
-            if tasks_have_user:
+            if tasks_have_k8s_user:
                 job_values.insert(0, job_tasks[0].get('user', '-'))
             job_table.add_row(job_values)

@@ -1075,10 +1107,12 @@ def format_job_table(
             job_duration = log_utils.readable_time_duration(
                 0, task['job_duration'], absolute=True)
             submitted = log_utils.readable_time_duration(task['submitted_at'])
+            user_values = get_user_column_values(task)
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
                 task['task_name'],
+                *user_values,
                 task['resources'],
                 # SUBMITTED
                 submitted if submitted != '-' else submitted,
@@ -1103,7 +1137,7 @@ def format_job_table(
                     schedule_state,
                     generate_details(task['failure_reason']),
                 ])
-            if tasks_have_user:
+            if tasks_have_k8s_user:
                 values.insert(0, task.get('user', '-'))
             job_table.add_row(values)

@@ -1135,6 +1169,9 @@ class ManagedJobCodeGen:
     _PREFIX = textwrap.dedent("""\
         from sky.jobs import utils
         from sky.jobs import state as managed_job_state
+        from sky.jobs import constants as managed_job_constants
+
+        managed_job_version = managed_job_constants.MANAGED_JOBS_VERSION
         """)

     @classmethod
@@ -1146,9 +1183,17 @@ class ManagedJobCodeGen:
         return cls._build(code)

     @classmethod
-    def cancel_jobs_by_id(cls, job_ids: Optional[List[int]]) -> str:
+    def cancel_jobs_by_id(cls,
+                          job_ids: Optional[List[int]],
+                          all_users: bool = False) -> str:
         code = textwrap.dedent(f"""\
-        msg = utils.cancel_jobs_by_id({job_ids})
+        if managed_job_version < 2:
+            # For backward compatibility, since all_users is not supported
+            # before #4787. Assume th
+            # TODO(cooperc): Remove compatibility before 0.12.0
+            msg = utils.cancel_jobs_by_id({job_ids})
+        else:
+            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)
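
The `managed_job_version < 2` branch illustrates how ManagedJobCodeGen stays backward compatible: the API server emits Python source that executes on the jobs controller, so the version check must happen at runtime on the controller, not at generation time on the server. A simplified sketch of that pattern (names abbreviated; not the exact SkyPilot code):

# Simplified sketch of version-gated code generation, mirroring
# ManagedJobCodeGen.cancel_jobs_by_id above. The generated source runs on
# the controller, whose library version may lag behind the API server's.
import textwrap


def gen_cancel_code(job_ids, all_users: bool = False) -> str:
    return textwrap.dedent(f"""\
        from sky.jobs import constants as managed_job_constants
        from sky.jobs import utils

        if managed_job_constants.MANAGED_JOBS_VERSION < 2:
            # Old controller: all_users is not understood; fall back.
            msg = utils.cancel_jobs_by_id({job_ids!r})
        else:
            msg = utils.cancel_jobs_by_id({job_ids!r}, all_users={all_users})
        print(msg, end="", flush=True)
        """)


# The returned string is shipped to, and executed on, the controller.
print(gen_cancel_code(None, all_users=True))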
sky/server/constants.py CHANGED
@@ -3,7 +3,7 @@
 # API server version, whenever there is a change in API server that requires a
 # restart of the local API server or error out when the client does not match
 # the server version.
-API_VERSION = '1'
+API_VERSION = '2'

 # Prefix for API request names.
 REQUEST_NAME_PREFIX = 'sky.'
sky/server/requests/payloads.py CHANGED
@@ -322,6 +322,7 @@ class JobsQueueBody(RequestBody):
     """The request body for the jobs queue endpoint."""
     refresh: bool = False
     skip_finished: bool = False
+    all_users: bool = False


 class JobsCancelBody(RequestBody):
@@ -329,6 +330,7 @@ class JobsCancelBody(RequestBody):
     name: Optional[str]
     job_ids: Optional[List[int]]
     all: bool = False
+    all_users: bool = False


 class JobsLogsBody(RequestBody):
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -55,12 +55,19 @@ setup: |

 run: |
   {{ sky_activate_python_env }}
+
+  # Write env vars to a file
+  {%- for env_name, env_value in controller_envs.items() %}
+  echo "export {{env_name}}='{{env_value}}'" >> {{remote_env_file_path}}
+  {%- endfor %}
+
   # Submit the job to the scheduler.
   # Note: The job is already in the `spot` table, marked as PENDING.
   # CloudVmRayBackend._exec_code_on_head() calls
   # managed_job_codegen.set_pending() before we get here.
   python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
-    --job-id $SKYPILOT_INTERNAL_JOB_ID
+    --job-id $SKYPILOT_INTERNAL_JOB_ID \
+    --env-file {{remote_env_file_path}}


 envs:
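
On the controller, the loop above renders to plain `export NAME='value'` lines in the `.env` file that the scheduler receives via `--env-file`. A sketch of parsing such a file back into a dict (the scheduler's actual `--env-file` handling may differ):

# Sketch: parsing an env file of `export NAME='value'` lines, like the one
# the template writes to {{remote_env_file_path}}. SkyPilot's actual
# handling in sky.jobs.scheduler may differ.
import tempfile


def parse_env_file(path: str) -> dict:
    envs = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line.startswith('export '):
                continue
            name, _, value = line[len('export '):].partition('=')
            envs[name] = value.strip("'")
    return envs


with tempfile.NamedTemporaryFile('w', suffix='.env', delete=False) as f:
    f.write("export SKYPILOT_DEV='1'\n")
    env_path = f.name
print(parse_env_file(env_path))  # {'SKYPILOT_DEV': '1'}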
sky/utils/common_utils.py CHANGED
@@ -774,13 +774,10 @@ def is_port_available(port: int, reuse_addr: bool = True) -> bool:
     return False


-# TODO(aylei): should be aware of cgroups
 def get_cpu_count() -> int:
-    """Get the number of CPUs.
-
-    If the API server is deployed as a pod in k8s cluster, we assume the
-    number of CPUs is provided by the downward API.
-    """
+    """Get the number of CPUs, with cgroup awareness."""
+    # This env-var is kept since it is still useful for limiting the resource
+    # of SkyPilot in non-containerized environments.
     cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
     if cpu_count is not None:
         try:
@@ -790,16 +787,11 @@ def get_cpu_count() -> int:
             raise ValueError(
                 f'Failed to parse the number of CPUs from {cpu_count}'
             ) from e
-    return psutil.cpu_count()
+    return _cpu_count()


-# TODO(aylei): should be aware of cgroups
 def get_mem_size_gb() -> float:
-    """Get the memory size in GB.
-
-    If the API server is deployed as a pod in k8s cluster, we assume the
-    memory size is provided by the downward API.
-    """
+    """Get the memory size in GB, with cgroup awareness."""
     mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
     if mem_size is not None:
         try:
@@ -808,4 +800,92 @@ def get_mem_size_gb() -> float:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
                     f'Failed to parse the memory size from {mem_size}') from e
-    return psutil.virtual_memory().total / (1024**3)
+    return _mem_size_gb()
+
+
+def _cpu_count() -> int:
+    # host cpu cores (logical)
+    cpu = psutil.cpu_count()
+    # cpu affinity on Linux
+    if hasattr(os, 'sched_getaffinity'):
+        # just for safe, length of CPU set should always <= logical cpu cores
+        cpu = min(cpu, len(os.sched_getaffinity(0)))
+    cgroup_cpu = _get_cgroup_cpu_limit()
+    if cgroup_cpu is not None:
+        cpu = min(cpu, int(cgroup_cpu))
+    return cpu
+
+
+def _mem_size_gb() -> float:
+    # host memory limit
+    mem = psutil.virtual_memory().total
+    cgroup_mem = _get_cgroup_memory_limit()
+    if cgroup_mem is not None:
+        mem = min(mem, cgroup_mem)
+    return mem / (1024**3)
+
+
+# Refer to:
+# - https://docs.kernel.org/admin-guide/cgroup-v1/index.html
+# - https://docs.kernel.org/admin-guide/cgroup-v2.html
+# for the standards of handler files in cgroupv1 and v2.
+# Since all those paths are well-known standards that are unlikely to change,
+# we use string literals instead of defining extra constants.
+def _get_cgroup_cpu_limit() -> Optional[float]:
+    """Return cpu limit from cgroups in cores.
+
+    Returns:
+        The cpu limit in cores as a float (can be fractional), or None if there
+        is no limit in cgroups.
+    """
+    try:
+        if _is_cgroup_v2():
+            with open('/sys/fs/cgroup/cpu.max', 'r', encoding='utf-8') as f:
+                quota_str, period_str = f.read().strip().split()
+                if quota_str == 'max':
+                    return None
+                quota = float(quota_str)
+                period = float(period_str)
+                return quota / period if quota > 0 else None
+        else:
+            # cgroup v1
+            with open('/sys/fs/cgroup/cpu/cpu.cfs_quota_us',
+                      'r',
+                      encoding='utf-8') as f:
+                quota = float(f.read().strip())
+            with open('/sys/fs/cgroup/cpu/cpu.cfs_period_us',
+                      'r',
+                      encoding='utf-8') as f:
+                period = float(f.read().strip())
+            # Return unlimited if cpu quota is not set.
+            # Note that we do not use cpu.shares since it is a relative weight
+            # instead of a hard limit. It is okay to get CPU throttling under
+            # high contention. And unlimited enables the server to use as much
+            # CPU as available if there is no contention.
+            return quota / period if (quota > 0 and period > 0) else None
+    except (OSError, ValueError):
+        return None
+
+
+def _get_cgroup_memory_limit() -> Optional[int]:
+    """Return memory limit from cgroups in bytes.
+
+    Returns:
+        The memory limit in bytes, or None if there is no limit in cgroups.
+    """
+    try:
+        path = ('/sys/fs/cgroup/memory.max' if _is_cgroup_v2() else
+                '/sys/fs/cgroup/memory/memory.limit_in_bytes')
+        with open(path, 'r', encoding='utf-8') as f:
+            value = f.read().strip()
+            if value == 'max' or not value:
+                return None
+            limit = int(value)
+            return limit if limit > 0 else None
+    except (OSError, ValueError):
+        return None
+
+
+def _is_cgroup_v2() -> bool:
+    """Return True if the environment is running cgroup v2."""
+    return os.path.isfile('/sys/fs/cgroup/cgroup.controllers')
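
With these helpers, `get_cpu_count()` and `get_mem_size_gb()` now respect container limits instead of reporting host totals. For example (a sketch; actual output depends on the environment):

# Sketch: in a container run with e.g. `docker run --cpus=2 --memory=4g ...`,
# the helpers above report the cgroup limits instead of the host totals.
from sky.utils import common_utils

print(common_utils.get_cpu_count())    # e.g. 2 on a 16-core host with --cpus=2
print(common_utils.get_mem_size_gb())  # e.g. ~4.0 with --memory=4g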
skypilot_nightly-1.0.0.dev20250225.dist-info/METADATA → skypilot_nightly-1.0.0.dev20250227.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250225
+Version: 1.0.0.dev20250227
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -169,7 +169,7 @@ Dynamic: summary

 <p align="center">
   <a href="https://docs.skypilot.co/">
-    <img alt="Documentation" src="https://readthedocs.org/projects/skypilot/badge/?version=latest">
+    <img alt="Documentation" src="https://img.shields.io/badge/docs-gray?logo=readthedocs&logoColor=f5f5f5">
   </a>

   <a href="https://github.com/skypilot-org/skypilot/releases">
@@ -192,6 +192,7 @@ Dynamic: summary

 ----
 :fire: *News* :fire:
+- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
 - [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)
 - [Feb 2025] Prepare and serve large-scale image search with **vector databases**: [**blog post**](https://blog.skypilot.co/large-scale-vector-database/), [**example**](./examples/vector_database/)
 - [Jan 2025] Launch and serve distilled models from **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or any cloud: [**R1 example**](./llm/deepseek-r1-distilled/) and [**Janus example**](./llm/deepseek-janus/)