skypilot-nightly 1.0.0.dev20250615__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/task.py
CHANGED
@@ -121,27 +121,61 @@ def _fill_in_env_vars(
     return json.loads(yaml_field_str)
 
 
-def _check_docker_login_config(task_envs: Dict[str, str]
+def _check_docker_login_config(task_envs: Dict[str, str],
+                               task_secrets: Dict[str, str]) -> bool:
+    """Validates a valid docker login config in task_envs and task_secrets.
 
+    Docker login variables must be specified together either in envs OR secrets,
+    not split across both. If any of the docker login env vars is set, all of
+    them must be set in the same location.
+
+    Args:
+        task_envs: Environment variables
+        task_secrets: Secret variables (optional, defaults to empty dict)
 
     Returns:
-        True if there is a valid docker login config
+        True if there is a valid docker login config.
         False otherwise.
     Raises:
-        ValueError: if
-            them are set.
+        ValueError: if docker login configuration is invalid.
     """
+    if task_secrets is None:
+        task_secrets = {}
+
     all_keys = constants.DOCKER_LOGIN_ENV_VARS
+    envs_keys = all_keys & set(task_envs.keys())
+    secrets_keys = all_keys & set(task_secrets.keys())
+
+    # Check if any docker variables exist
+    if not envs_keys and not secrets_keys:
         return False
+
+    # Check if variables are split across envs and secrets
+    if envs_keys and secrets_keys:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
+                'Docker login variables must be specified together either '
+                'in envs OR secrets, not split across both. '
+                f'Found in envs: {sorted(envs_keys)}, '
+                f'Found in secrets: {sorted(secrets_keys)}')
+
+    # Check if all variables are present in the chosen location
+    if envs_keys:
+        if len(envs_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in envs. '
+                    f'Missing from envs: {sorted(all_keys - envs_keys)}')
+
+    if secrets_keys:
+        if len(secrets_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in secrets. '
+                    f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
+
     return True
 
 
@@ -149,11 +183,13 @@ def _with_docker_login_config(
         resources: Union[Set['resources_lib.Resources'],
                          List['resources_lib.Resources']],
         task_envs: Dict[str, str],
+        task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    if not _check_docker_login_config(task_envs):
+    if not _check_docker_login_config(task_envs, task_secrets):
         return resources
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
 
     def _add_docker_login_config(resources: 'resources_lib.Resources'):
         docker_image = resources.extract_docker_image()
@@ -181,8 +217,11 @@ def _with_docker_username_for_runpod(
         resources: Union[Set['resources_lib.Resources'],
                          List['resources_lib.Resources']],
         task_envs: Dict[str, str],
+        task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_username_for_runpod = envs.get(
         constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
 
     # We should not call r.copy() if docker_username_for_runpod is None,
@@ -204,6 +243,7 @@ class Task:
                 setup: Optional[str] = None,
                 run: Optional[CommandOrCommandGen] = None,
                 envs: Optional[Dict[str, str]] = None,
+                secrets: Optional[Dict[str, str]] = None,
                 workdir: Optional[str] = None,
                 num_nodes: Optional[int] = None,
                 # Advanced:
@@ -254,6 +294,9 @@ class Task:
            self-contained lambda.
          envs: A dictionary of environment variables to set before running the
            setup and run commands.
+          secrets: A dictionary of secret environment variables to set before
+            running the setup and run commands. These will be redacted in logs
+            and YAML output.
          workdir: The local working directory. This directory will be synced
            to a location on the remote VM(s), and ``setup`` and ``run``
            commands will be run under that location (thus, they can rely on
@@ -275,6 +318,13 @@ class Task:
                                  storage_lib.StoreType] = {}
         self.setup = setup
         self._envs = envs or {}
+        self._secrets = secrets or {}
+
+        # Validate Docker login configuration early if both envs and secrets
+        # contain Docker variables
+        if self._envs or self._secrets:
+            _check_docker_login_config(self._envs, self._secrets)
+
         self.workdir = workdir
         self.docker_image = (docker_image if docker_image else
                              'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -447,6 +497,7 @@ class Task:
     def from_yaml_config(
         config: Dict[str, Any],
         env_overrides: Optional[List[Tuple[str, str]]] = None,
+        secrets_overrides: Optional[List[Tuple[str, str]]] = None,
     ) -> 'Task':
        # More robust handling for 'envs': explicitly convert keys and values to
        # str, since users may pass '123' as keys/values which will get parsed
@@ -460,6 +511,20 @@ class Task:
            else:
                new_envs[str(k)] = None
        config['envs'] = new_envs
+
+       # More robust handling for 'secrets': explicitly convert keys and values
+       # to str, since users may pass '123' as keys/values which will get
+       # parsed as int causing validate_schema() to fail.
+       secrets = config.get('secrets')
+       if secrets is not None and isinstance(secrets, dict):
+           new_secrets: Dict[str, Optional[str]] = {}
+           for k, v in secrets.items():
+               if v is not None:
+                   new_secrets[str(k)] = str(v)
+               else:
+                   new_secrets[str(k)] = None
+           config['secrets'] = new_secrets
+
        common_utils.validate_schema(config, schemas.get_task_schema(),
                                     'Invalid task YAML: ')
        if env_overrides is not None:
@@ -473,6 +538,12 @@ class Task:
            new_envs.update(env_overrides)
            config['envs'] = new_envs
 
+       if secrets_overrides is not None:
+           # Override secrets vars from CLI.
+           new_secrets = config.get('secrets', {})
+           new_secrets.update(secrets_overrides)
+           config['secrets'] = new_secrets
+
        for k, v in config.get('envs', {}).items():
            if v is None:
                with ux_utils.print_exception_no_traceback():
@@ -482,6 +553,15 @@ class Task:
                        f'To set it to be empty, use an empty string ({k}: "" '
                        f'in task YAML or --env {k}="" in CLI).')
 
+       for k, v in config.get('secrets', {}).items():
+           if v is None:
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError(
+                       f'Secret variable {k!r} is None. Please set a '
+                       'value for it in task YAML or with --secret flag. '
+                       f'To set it to be empty, use an empty string ({k}: "" '
+                       f'in task YAML or --secret {k}="" in CLI).')
+
        # Fill in any Task.envs into file_mounts (src/dst paths, storage
        # name/source).
        if config.get('file_mounts') is not None:
@@ -505,6 +585,7 @@ class Task:
            setup=config.pop('setup', None),
            num_nodes=config.pop('num_nodes', None),
            envs=config.pop('envs', None),
+           secrets=config.pop('secrets', None),
            event_callback=config.pop('event_callback', None),
            file_mounts_mapping=config.pop('file_mounts_mapping', None),
        )
@@ -687,6 +768,10 @@ class Task:
    def envs(self) -> Dict[str, str]:
        return self._envs
 
+   @property
+   def secrets(self) -> Dict[str, str]:
+       return self._secrets
+
    def update_envs(
            self, envs: Union[None, List[Tuple[str, str]],
                              Dict[str, str]]) -> 'Task':
@@ -727,17 +812,70 @@ class Task:
        # If the update_envs() is called after set_resources(), we need to
        # manually update docker login config in task resources, in case the
        # docker login envs are newly added.
-       if _check_docker_login_config(self._envs):
+       if _check_docker_login_config(self._envs, self._secrets):
            self.resources = _with_docker_login_config(self.resources,
-                                                       self._envs)
+                                                       self._envs,
+                                                       self._secrets)
        self.resources = _with_docker_username_for_runpod(
-           self.resources, self._envs)
+           self.resources, self._envs, self._secrets)
+       return self
+
+   def update_secrets(
+           self, secrets: Union[None, List[Tuple[str, str]],
+                                Dict[str, str]]) -> 'Task':
+       """Updates secret env vars for use inside the setup/run commands.
+
+       Args:
+         secrets: (optional) either a list of ``(secret_name, value)`` or a
+           dict ``{secret_name: value}``.
+
+       Returns:
+         self: The current task, with secrets updated.
+
+       Raises:
+         ValueError: if various invalid inputs errors are detected.
+       """
+       if secrets is None:
+           secrets = {}
+       if isinstance(secrets, (list, tuple)):
+           keys = set(secret[0] for secret in secrets)
+           if len(keys) != len(secrets):
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError('Duplicate secret keys provided.')
+           secrets = dict(secrets)
+       if isinstance(secrets, dict):
+           for key in secrets:
+               if not isinstance(key, str):
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError('Secret keys must be strings.')
+               if not common_utils.is_valid_env_var(key):
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError(f'Invalid secret key: {key}')
+       else:
+           with ux_utils.print_exception_no_traceback():
+               raise ValueError(
+                   'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
+                   f'{secrets}')
+       self._secrets.update(secrets)
+       # Validate Docker login configuration if needed
+       if _check_docker_login_config(self._envs, self._secrets):
+           self.resources = _with_docker_login_config(self.resources,
+                                                       self._envs,
+                                                       self._secrets)
+       self.resources = _with_docker_username_for_runpod(
+           self.resources, self._envs, self._secrets)
        return self
 
    @property
    def use_spot(self) -> bool:
        return any(r.use_spot for r in self.resources)
 
+   @property
+   def envs_and_secrets(self) -> Dict[str, str]:
+       envs = self.envs.copy()
+       envs.update(self.secrets)
+       return envs
+
    def set_inputs(self, inputs: str,
                   estimated_size_gigabytes: float) -> 'Task':
        # E.g., 's3://bucket', 'gs://bucket', or None.
@@ -796,10 +934,11 @@ class Task:
        if isinstance(resources, sky.Resources):
            resources = {resources}
        # TODO(woosuk): Check if the resources are None.
-       self.resources = _with_docker_login_config(resources, self.envs)
+       self.resources = _with_docker_login_config(resources, self.envs,
+                                                   self.secrets)
        # Only have effect on RunPod.
        self.resources = _with_docker_username_for_runpod(
-           self.resources, self.envs)
+           self.resources, self.envs, self.secrets)
 
        # Evaluate if the task requires FUSE and set the requires_fuse flag
        for _, storage_obj in self.storage_mounts.items():
@@ -1266,7 +1405,7 @@ class Task:
                d[k] = v
        return d
 
-   def to_yaml_config(self) -> Dict[str, Any]:
+   def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
        """Returns a yaml-style dict representation of the task.
 
        INTERNAL: this method is internal-facing.
@@ -1314,8 +1453,19 @@ class Task:
        add_if_not_none('workdir', self.workdir)
        add_if_not_none('event_callback', self.event_callback)
        add_if_not_none('run', self.run)
+
+       # Add envs without redaction
        add_if_not_none('envs', self.envs, no_empty=True)
 
+       # Add secrets with redaction if requested
+       secrets = self.secrets
+       if secrets and redact_secrets:
+           secrets = {
+               k: '<redacted>' if isinstance(v, str) else v
+               for k, v in secrets.items()
+           }
+       add_if_not_none('secrets', secrets, no_empty=True)
+
        add_if_not_none('file_mounts', {})
 
        if self.file_mounts is not None:
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -632,19 +632,66 @@ available_node_types:
           {% if high_availability %}
             mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
             if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+              SKYPILOT_HA_RECOVERY_LOG="/tmp/ha_recovery.log"
+              echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+              start_time=$SECONDS
+              retry_count=0
+
+              # Wait for Ray to be ready, as the following commands is depending on Ray.
+              GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
+              while true; do
+                retry_count=$((retry_count + 1))
+                current_duration=$(( SECONDS - start_time ))
+                echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
+
+                bash --login -c "$GET_RAY_STATUS_CMD"
+                if [ $? -eq 0 ]; then
+                  wait_duration=$(( SECONDS - start_time ))
+                  echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
+                  break
+                fi
+                echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
+                sleep 2
+              done
+
               # ! Keep this aligned with `CloudVmRayBackend._setup()`
-              # Suppose all `task.setup` are the same for
+              # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
               # So be careful for compatibility issue once you change it.
               chmod +x {{k8s_high_availability_deployment_setup_script_path}}
               /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
-              echo "=== Controller setup commands completed for recovery ==="
+              echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
+
+              touch {{k8s_high_availability_restarting_signal_file}}
+              # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
+              # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
+              # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
+              # will delete the service from the database after it is terminated so everything in the database is running.
+              ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
+              if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+                read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
+              fi
               for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+                # This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
+                # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
+                JOB_ID=$(basename $file | sed 's/sky_job_//')
+                # If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
+                if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+                  if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
+                    continue
+                  fi
+                fi
                 # ! Keep this aligned with `CloudVmRayBackend._execute()`
                 chmod +x $file
+                # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
+                # We should do more tests and make sure it will scale well.
                 /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
-                echo "=== Controller task run for service (file: $file) completed for recovery ==="
+                echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
               done
+              rm {{k8s_high_availability_restarting_signal_file}}
+
+              duration=$(( SECONDS - start_time ))
+              echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+              echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
             fi
 
             touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
sky/templates/scp-ray.yml.j2
CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
 
 provider:
   type: external
-  module: sky.
+  module: sky.provision.scp
   region: {{region}}
   cache_stopped_nodes: True
 
@@ -24,19 +24,6 @@ available_node_types:
       InstanceType: {{instance_type}}
       imageId: {{image_id}}
       diskSize: {{disk_size}}
-{% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      AuthorizedKey: |
-        skypilot:ssh_public_key_content
-      InstanceType: {{instance_type}}
-      imageId: {{image_id}}
-      diskSize: {{disk_size}}
-
-{%- endif %}
 
 head_node_type: ray_head_default
 
@@ -50,10 +37,6 @@ file_mounts: {
 {%- endfor %}
 }
 
-rsync_exclude: []
-
-initialization_commands: []
-
 # List of shell commands to run to set up nodes.
 # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
 # connection, which is expensive. Try your best to co-locate commands into fewer
@@ -77,36 +60,6 @@ setup_commands:
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
   [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
-  {{ ssh_max_sessions_config }}
-
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
-  # all the sessions to be reloaded. This is a workaround.
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
 
-#
-
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/users/permission.py
CHANGED
@@ -30,36 +30,28 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
-                                      'model.conf')
-            enforcer = casbin.Enforcer(model_path, adapter)
-            self.enforcer = enforcer
-        else:
-            self.enforcer = _enforcer_instance.enforcer
-        else:
-            self.enforcer = _enforcer_instance.enforcer
-        with _policy_lock():
-            self._maybe_initialize_policies()
+        global _enforcer_instance
+        if _enforcer_instance is None:
+            # For different threads, we share the same enforcer instance.
+            with _lock:
+                if _enforcer_instance is None:
+                    _enforcer_instance = self
+                    engine = global_user_state.initialize_and_get_db()
+                    adapter = sqlalchemy_adapter.Adapter(engine)
+                    model_path = os.path.join(os.path.dirname(__file__),
+                                              'model.conf')
+                    enforcer = casbin.Enforcer(model_path, adapter)
+                    self.enforcer = enforcer
+                else:
+                    self.enforcer = _enforcer_instance.enforcer
+        else:
+            self.enforcer = _enforcer_instance.enforcer
+        with _policy_lock():
+            self._maybe_initialize_policies()
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
+        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
@@ -138,7 +130,6 @@ class PermissionService:
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -158,7 +149,6 @@ class PermissionService:
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -191,7 +181,6 @@ class PermissionService:
         Returns:
             A list of role names that the user has.
         """
-        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -204,7 +193,6 @@ class PermissionService:
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
-        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -213,7 +201,6 @@ class PermissionService:
 
     def load_policy(self):
         """Load policy from storage with lock."""
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -229,7 +216,6 @@ class PermissionService:
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
-        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -257,7 +243,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -275,7 +260,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -289,7 +273,6 @@ class PermissionService:
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
-        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
sky/utils/command_runner.py
CHANGED
@@ -561,7 +561,7 @@ class SSHCommandRunner(CommandRunner):
         if self.ssh_control_name is not None:
             control_path = _ssh_control_path(self.ssh_control_name)
             if control_path is not None:
-                # Suppress the `Exit request sent.` output for this
+                # Suppress the `Exit request sent.` output for this command
                 # which would interrupt the CLI spinner.
                 cmd = (f'ssh -O exit -S {control_path}/%C '
                        f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|