skypilot-nightly 1.0.0.dev20251014__py3-none-any.whl → 1.0.0.dev20251016__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (51)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +29 -15
  3. sky/backends/cloud_vm_ray_backend.py +30 -13
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  6. sky/dashboard/out/clusters/[cluster].html +1 -1
  7. sky/dashboard/out/clusters.html +1 -1
  8. sky/dashboard/out/config.html +1 -1
  9. sky/dashboard/out/index.html +1 -1
  10. sky/dashboard/out/infra/[context].html +1 -1
  11. sky/dashboard/out/infra.html +1 -1
  12. sky/dashboard/out/jobs/[job].html +1 -1
  13. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  14. sky/dashboard/out/jobs.html +1 -1
  15. sky/dashboard/out/users.html +1 -1
  16. sky/dashboard/out/volumes.html +1 -1
  17. sky/dashboard/out/workspace/new.html +1 -1
  18. sky/dashboard/out/workspaces/[name].html +1 -1
  19. sky/dashboard/out/workspaces.html +1 -1
  20. sky/exceptions.py +13 -1
  21. sky/jobs/constants.py +1 -1
  22. sky/jobs/scheduler.py +2 -4
  23. sky/jobs/server/core.py +2 -1
  24. sky/jobs/server/server.py +5 -3
  25. sky/jobs/state.py +12 -6
  26. sky/jobs/utils.py +8 -2
  27. sky/provision/common.py +2 -0
  28. sky/provision/instance_setup.py +10 -2
  29. sky/provision/kubernetes/instance.py +34 -10
  30. sky/provision/kubernetes/utils.py +9 -0
  31. sky/schemas/generated/jobsv1_pb2.py +52 -52
  32. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  33. sky/serve/server/server.py +1 -0
  34. sky/server/requests/executor.py +51 -15
  35. sky/server/requests/preconditions.py +2 -4
  36. sky/server/requests/requests.py +14 -23
  37. sky/server/requests/threads.py +106 -0
  38. sky/server/rest.py +36 -18
  39. sky/server/server.py +24 -0
  40. sky/skylet/constants.py +1 -1
  41. sky/skylet/services.py +3 -1
  42. sky/utils/asyncio_utils.py +18 -0
  43. sky/utils/context_utils.py +2 -0
  44. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/METADATA +37 -36
  45. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/RECORD +51 -49
  46. /sky/dashboard/out/_next/static/{9Fek73R28lDp1A5J4N7g7 → pbgtEUoCUdmJyLHjgln5A}/_buildManifest.js +0 -0
  47. /sky/dashboard/out/_next/static/{9Fek73R28lDp1A5J4N7g7 → pbgtEUoCUdmJyLHjgln5A}/_ssgManifest.js +0 -0
  48. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/WHEEL +0 -0
  49. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/entry_points.txt +0 -0
  50. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/licenses/LICENSE +0 -0
  51. {skypilot_nightly-1.0.0.dev20251014.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -2148,8 +2148,12 @@ class ManagedJobCodeGen:
         return cls._build(code)

     @classmethod
-    def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
-                    workspace: str, entrypoint: str) -> str:
+    def set_pending(cls,
+                    job_id: int,
+                    managed_job_dag: 'dag_lib.Dag',
+                    workspace: str,
+                    entrypoint: str,
+                    user_hash: Optional[str] = None) -> str:
         dag_name = managed_job_dag.name
         pool = managed_job_dag.pool
         # Add the managed job to queue table.
@@ -2166,6 +2170,8 @@ class ManagedJobCodeGen:
                 pool_hash = serve_state.get_service_hash({pool!r})
                 set_job_info_kwargs['pool'] = {pool!r}
                 set_job_info_kwargs['pool_hash'] = pool_hash
+            if managed_job_version >= 11:
+                set_job_info_kwargs['user_hash'] = {user_hash!r}
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
sky/provision/common.py CHANGED
@@ -97,6 +97,8 @@ class InstanceInfo:
     external_ip: Optional[str]
     tags: Dict[str, str]
     ssh_port: int = 22
+    # The internal service address of the instance on Kubernetes.
+    internal_svc: Optional[str] = None

     def get_feasible_ip(self) -> str:
         """Get the most feasible IPs of the instance. This function returns
sky/provision/instance_setup.py CHANGED
@@ -434,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
     # use the external IP of the head node.
     use_external_ip = cluster_info.custom_ray_options.pop(
         'use_external_ip', False)
-    head_ip = (head_instance.internal_ip
-               if not use_external_ip else head_instance.external_ip)
+
+    if use_external_ip:
+        head_ip = head_instance.external_ip
+    else:
+        # For Kubernetes, use the internal service address of the head node.
+        # Keep this consistent with the logic in kubernetes-ray.yml.j2
+        if head_instance.internal_svc:
+            head_ip = head_instance.internal_svc
+        else:
+            head_ip = head_instance.internal_ip

     ray_cmd = ray_worker_start_command(custom_resource,
                                        cluster_info.custom_ray_options,
sky/provision/kubernetes/instance.py CHANGED
@@ -959,12 +959,19 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,

     def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
-        if head_pod_name is None and i == 0:
-            # First pod should be head if no head exists
-            pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
-            head_selector = _head_service_selector(cluster_name_on_cloud)
-            pod_spec_copy['metadata']['labels'].update(head_selector)
-            pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+        # 0 is for head pod, while 1+ is for worker pods.
+        if i == 0:
+            if head_pod_name is None:
+                # First pod should be head if no head exists
+                pod_spec_copy['metadata']['labels'].update(
+                    constants.HEAD_NODE_TAGS)
+                head_selector = _head_service_selector(cluster_name_on_cloud)
+                pod_spec_copy['metadata']['labels'].update(head_selector)
+                pod_spec_copy['metadata'][
+                    'name'] = f'{cluster_name_on_cloud}-head'
+            else:
+                # If head pod already exists, we skip creating it.
+                return
         else:
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
@@ -1105,9 +1112,16 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                    'and then up the cluster again.')
         raise exceptions.InconsistentHighAvailabilityError(message)

-    # Create pods in parallel
-    created_resources = subprocess_utils.run_in_parallel(
-        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the Pods consistent especially for the case where some Pods are down
+        # due to node failure or manual termination, etc. and then launch
+        # again to create the Pods back.
+        # The existing Pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
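Iterating over range(config.count) rather than range(to_start_count) keeps each pod's index, and therefore its identity, stable across recoveries: if a middle pod was lost, relaunching must recreate exactly that slot instead of shifting the surviving pods. A toy illustration of the skip-existing pattern (the pod naming scheme here is hypothetical, used only to make the idea concrete):

from typing import Optional

existing_pods = {'mycluster-head', 'mycluster-worker2'}  # worker1 was lost

def pod_name(cluster: str, i: int) -> str:
    # Hypothetical naming: index 0 is the head pod, 1+ are worker pods.
    return f'{cluster}-head' if i == 0 else f'{cluster}-worker{i}'

def create_pod(i: int) -> Optional[str]:
    name = pod_name('mycluster', i)
    if name in existing_pods:
        return None  # skip pods that still exist, as _create_resource_thread does
    return name

desired_count = 3  # config.count: the full cluster size, not just the delta
created = [name for i in range(desired_count)
           if (name := create_pod(i)) is not None]
print(created)  # ['mycluster-worker1'] -- only the lost slot is recreated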
@@ -1350,6 +1364,9 @@ def get_cluster_info(
             external_ip=None,
             ssh_port=port,
             tags=pod.metadata.labels,
+            # TODO(hailong): `cluster.local` may need to be configurable
+            # Service name is same as the pod name for now.
+            internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
         )
     ]
     if _is_head(pod):
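Since each pod is fronted by a Service of the same name, the head node becomes reachable at the standard in-cluster DNS name of the form <service>.<namespace>.svc.<cluster-domain>, which stays stable across pod restarts, unlike the pod IP. A small sketch of the address construction (mirroring the line above; the configurable domain is what the TODO suggests, not current behavior):

def internal_service_fqdn(pod_name: str, namespace: str,
                          cluster_domain: str = 'cluster.local') -> str:
    # The diff hardcodes 'cluster.local'; the TODO notes it may need to
    # become configurable for clusters with a non-default DNS domain.
    return f'{pod_name}.{namespace}.svc.{cluster_domain}'

print(internal_service_fqdn('mycluster-head', 'default'))
# mycluster-head.default.svc.cluster.local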
@@ -1388,6 +1405,13 @@ def get_cluster_info(
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
@@ -1397,7 +1421,7 @@ def get_cluster_info(
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus': cpu_request,
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)
sky/provision/kubernetes/utils.py CHANGED
@@ -2241,6 +2241,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE


+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
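Kubernetes expresses fractional CPU requests in millicores, so '100m' means 0.1 CPU. Ray's num-cpus option must be an integer, hence the max(int(num_cpus), 1) clamp in get_cluster_info above: a sub-core request still registers one schedulable CPU instead of zero. A quick standalone illustration of both steps:

def parse_cpu_to_float(resource_str: str) -> float:
    # Mirrors parse_cpu_or_gpu_resource_to_float: '100m' -> 0.1, '2' -> 2.0
    if not resource_str:
        return 0.0
    if resource_str[-1] == 'm':
        return float(resource_str[:-1]) / 1000
    return float(resource_str)

for request in ('100m', '500m', '2', '3.5'):
    num_cpus = parse_cpu_to_float(request)
    ray_num_cpus = max(int(num_cpus), 1)  # never hand 0 CPUs to ray
    print(f'{request!r}: parsed={num_cpus}, ray num-cpus={ray_num_cpus}')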
sky/schemas/generated/jobsv1_pb2.py CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()



-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=2185
-  _globals['_JOBSTATUS']._serialized_end=2454
+  _globals['_JOBSTATUS']._serialized_start=2219
+  _globals['_JOBSTATUS']._serialized_end=2488
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -34,53 +34,53 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_QUEUEJOBREQUEST']._serialized_start=235
   _globals['_QUEUEJOBREQUEST']._serialized_end=414
   _globals['_MANAGEDJOBINFO']._serialized_start=417
-  _globals['_MANAGEDJOBINFO']._serialized_end=554
-  _globals['_MANAGEDJOBTASK']._serialized_start=556
-  _globals['_MANAGEDJOBTASK']._serialized_end=649
-  _globals['_QUEUEJOBRESPONSE']._serialized_start=651
-  _globals['_QUEUEJOBRESPONSE']._serialized_end=669
-  _globals['_UPDATESTATUSREQUEST']._serialized_start=671
-  _globals['_UPDATESTATUSREQUEST']._serialized_end=692
-  _globals['_UPDATESTATUSRESPONSE']._serialized_start=694
-  _globals['_UPDATESTATUSRESPONSE']._serialized_end=716
-  _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
-  _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
-  _globals['_JOBINFO']._serialized_start=797
-  _globals['_JOBINFO']._serialized_end=1088
-  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
-  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
-  _globals['_CANCELJOBSREQUEST']._serialized_start=1145
-  _globals['_CANCELJOBSREQUEST']._serialized_end=1239
-  _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
-  _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
-  _globals['_TAILLOGSREQUEST']._serialized_start=1355
-  _globals['_TAILLOGSREQUEST']._serialized_end=1482
-  _globals['_TAILLOGSRESPONSE']._serialized_start=1484
-  _globals['_TAILLOGSRESPONSE']._serialized_end=1539
-  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
-  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
-  _globals['_JOBSSERVICE']._serialized_start=2457
-  _globals['_JOBSSERVICE']._serialized_end=3370
+  _globals['_MANAGEDJOBINFO']._serialized_end=588
+  _globals['_MANAGEDJOBTASK']._serialized_start=590
+  _globals['_MANAGEDJOBTASK']._serialized_end=683
+  _globals['_QUEUEJOBRESPONSE']._serialized_start=685
+  _globals['_QUEUEJOBRESPONSE']._serialized_end=703
+  _globals['_UPDATESTATUSREQUEST']._serialized_start=705
+  _globals['_UPDATESTATUSREQUEST']._serialized_end=726
+  _globals['_UPDATESTATUSRESPONSE']._serialized_start=728
+  _globals['_UPDATESTATUSRESPONSE']._serialized_end=750
+  _globals['_GETJOBQUEUEREQUEST']._serialized_start=752
+  _globals['_GETJOBQUEUEREQUEST']._serialized_end=828
+  _globals['_JOBINFO']._serialized_start=831
+  _globals['_JOBINFO']._serialized_end=1122
+  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1124
+  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1177
+  _globals['_CANCELJOBSREQUEST']._serialized_start=1179
+  _globals['_CANCELJOBSREQUEST']._serialized_end=1273
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=1275
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=1322
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1324
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1354
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1356
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1387
+  _globals['_TAILLOGSREQUEST']._serialized_start=1389
+  _globals['_TAILLOGSREQUEST']._serialized_end=1516
+  _globals['_TAILLOGSRESPONSE']._serialized_start=1518
+  _globals['_TAILLOGSRESPONSE']._serialized_end=1573
+  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1575
+  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1613
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1616
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1780
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1710
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1780
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1782
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1847
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1849
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1902
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1904
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1965
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1967
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=2016
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=2018
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2061
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2064
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2216
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2167
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2216
+  _globals['_JOBSSERVICE']._serialized_start=2491
+  _globals['_JOBSSERVICE']._serialized_end=3404
 # @@protoc_insertion_point(module_scope)
sky/schemas/generated/jobsv1_pb2.pyi CHANGED
@@ -66,18 +66,20 @@ class QueueJobRequest(_message.Message):
     def __init__(self, job_id: _Optional[int] = ..., codegen: _Optional[str] = ..., script_path: _Optional[str] = ..., remote_log_dir: _Optional[str] = ..., managed_job: _Optional[_Union[ManagedJobInfo, _Mapping]] = ...) -> None: ...

 class ManagedJobInfo(_message.Message):
-    __slots__ = ("name", "pool", "workspace", "entrypoint", "tasks")
+    __slots__ = ("name", "pool", "workspace", "entrypoint", "tasks", "user_id")
     NAME_FIELD_NUMBER: _ClassVar[int]
     POOL_FIELD_NUMBER: _ClassVar[int]
     WORKSPACE_FIELD_NUMBER: _ClassVar[int]
     ENTRYPOINT_FIELD_NUMBER: _ClassVar[int]
     TASKS_FIELD_NUMBER: _ClassVar[int]
+    USER_ID_FIELD_NUMBER: _ClassVar[int]
     name: str
     pool: str
     workspace: str
     entrypoint: str
     tasks: _containers.RepeatedCompositeFieldContainer[ManagedJobTask]
-    def __init__(self, name: _Optional[str] = ..., pool: _Optional[str] = ..., workspace: _Optional[str] = ..., entrypoint: _Optional[str] = ..., tasks: _Optional[_Iterable[_Union[ManagedJobTask, _Mapping]]] = ...) -> None: ...
+    user_id: str
+    def __init__(self, name: _Optional[str] = ..., pool: _Optional[str] = ..., workspace: _Optional[str] = ..., entrypoint: _Optional[str] = ..., tasks: _Optional[_Iterable[_Union[ManagedJobTask, _Mapping]]] = ..., user_id: _Optional[str] = ...) -> None: ...

 class ManagedJobTask(_message.Message):
     __slots__ = ("task_id", "name", "resources_str", "metadata_json")
sky/serve/server/server.py CHANGED
@@ -98,6 +98,7 @@ async def tail_logs(
     request: fastapi.Request, log_body: payloads.ServeLogsBody,
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
+    executor.check_request_thread_executor_available()
     request_task = executor.prepare_request(
         request_id=request.state.request_id,
         request_name='serve.logs',
sky/server/requests/executor.py CHANGED
@@ -48,6 +48,7 @@ from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
 from sky.server.requests import requests as api_requests
+from sky.server.requests import threads
 from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
@@ -81,23 +82,28 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)

-# Max threads that is equivalent to the number of thread workers in the
-# default thread pool executor of event loop.
-_REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 0) + 4)
+# An upper limit of max threads for request execution per server process that
+# is unlikely to be reached, to allow higher concurrency while still preventing
+# the server process from becoming overloaded.
+_REQUEST_THREADS_LIMIT = 128

 _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
-# A dedicated thread pool executor for synced requests execution in coroutine
-_REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
+# A dedicated thread pool executor for synced requests execution in coroutine to
+# avoid:
+# 1. blocking the event loop;
+# 2. exhausting the default thread pool executor of event loop;
+_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None


-def get_request_thread_executor() -> concurrent.futures.ThreadPoolExecutor:
+def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
     """Lazy init and return the request thread executor for current process."""
     global _REQUEST_THREAD_EXECUTOR
     if _REQUEST_THREAD_EXECUTOR is not None:
         return _REQUEST_THREAD_EXECUTOR
     with _REQUEST_THREAD_EXECUTOR_LOCK:
         if _REQUEST_THREAD_EXECUTOR is None:
-            _REQUEST_THREAD_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
+            _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
+                name='request_thread_executor',
                 max_workers=_REQUEST_THREADS_LIMIT)
     return _REQUEST_THREAD_EXECUTOR

@@ -561,6 +567,21 @@ class CoroutineTask:
         pass


+def check_request_thread_executor_available() -> None:
+    """Check if the request thread executor is available.
+
+    This is a best effort check to hint the client to retry other server
+    processes when there is no available thread worker in the current one. But
+    a request may pass this check and still fail to get a worker at execution
+    time due to a race condition. In this case, the client will see a failed
+    request instead of a retry.
+
+    TODO(aylei): this can be refined with a refactor of our coroutine
+    execution flow.
+    """
+    get_request_thread_executor().check_available()
+
+
 def execute_request_in_coroutine(
         request: api_requests.Request) -> CoroutineTask:
     """Execute a request in current event loop.
@@ -575,6 +596,18 @@ def execute_request_in_coroutine(
     return CoroutineTask(task)


+def _execute_with_config_override(func: Callable,
+                                  request_body: payloads.RequestBody,
+                                  request_id: str, request_name: str,
+                                  **kwargs) -> Any:
+    """Execute a function with env and config override inside a thread."""
+    # Override the environment and config within this thread's context,
+    # which gets copied when we call to_thread.
+    with override_request_env_and_config(request_body, request_id,
+                                         request_name):
+        return func(**kwargs)
+
+
 async def _execute_request_coroutine(request: api_requests.Request):
     """Execute a request in current event loop.
@@ -592,14 +625,17 @@ async def _execute_request_coroutine(request: api_requests.Request):
     request_task.status = api_requests.RequestStatus.RUNNING
     # Redirect stdout and stderr to the request log path.
     original_output = ctx.redirect_log(request.log_path)
-    # Override environment variables that backs env_options.Options
-    # TODO(aylei): compared to process executor, running task in coroutine has
-    # two issues to fix:
-    # 1. skypilot config is not contextual
-    # 2. envs that read directly from os.environ are not contextual
-    ctx.override_envs(request_body.env_vars)
-    fut: asyncio.Future = context_utils.to_thread_with_executor(
-        get_request_thread_executor(), func, **request_body.to_kwargs())
+    try:
+        fut: asyncio.Future = context_utils.to_thread_with_executor(
+            get_request_thread_executor(), _execute_with_config_override, func,
+            request_body, request.request_id, request.name,
+            **request_body.to_kwargs())
+    except Exception as e:  # pylint: disable=broad-except
+        ctx.redirect_log(original_output)
+        api_requests.set_request_failed(request.request_id, e)
+        logger.error(f'Failed to run request {request.request_id} due to '
+                     f'{common_utils.format_exception(e)}')
+        return

     async def poll_task(request_id: str) -> bool:
         req_status = await api_requests.get_request_status_async(request_id)
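Running a blocking request handler on an explicit executor while preserving the request's context (overridden env vars, config, redirected logs) requires copying the calling context into the worker thread. context_utils.to_thread_with_executor itself is not shown in this diff; below is a minimal sketch of how such a helper can be built from contextvars.copy_context and loop.run_in_executor. This is an assumption based on the call sites above, not SkyPilot's actual implementation:

import asyncio
import concurrent.futures
import contextvars
import functools

def to_thread_with_executor(executor: concurrent.futures.Executor,
                            func, *args, **kwargs) -> asyncio.Future:
    # Snapshot the caller's contextvars so the worker thread observes the
    # same request-scoped state (env/config overrides, redirected logs, ...),
    # then schedule the call on the explicit executor instead of the event
    # loop's default thread pool.
    loop = asyncio.get_running_loop()
    ctx = contextvars.copy_context()
    call = functools.partial(ctx.run, func, *args, **kwargs)
    return loop.run_in_executor(executor, call)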
sky/server/requests/preconditions.py CHANGED
@@ -112,10 +112,8 @@ class Precondition(abc.ABC):
                     return True
                 if status_msg is not None and status_msg != last_status_msg:
                     # Update the status message if it has changed.
-                    async with api_requests.update_request_async(
-                            self.request_id) as req:
-                        assert req is not None, self.request_id
-                        req.status_msg = status_msg
+                    await api_requests.update_status_msg_async(
+                        self.request_id, status_msg)
                     last_status_msg = status_msg
         except (Exception, SystemExit, KeyboardInterrupt) as e:  # pylint: disable=broad-except
             api_requests.set_request_failed(self.request_id, e)
sky/server/requests/requests.py CHANGED
@@ -14,8 +14,8 @@ import sqlite3
 import threading
 import time
 import traceback
-from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
-                    NamedTuple, Optional, Tuple)
+from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
+                    Tuple)

 import anyio
 import colorama
@@ -32,6 +32,7 @@ from sky.server import daemons
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
+from sky.utils import asyncio_utils
 from sky.utils import common_utils
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
@@ -578,27 +579,14 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:

 @init_db
 @metrics_lib.time_me
-def update_request_async(
-        request_id: str) -> AsyncContextManager[Optional[Request]]:
-    """Async version of update_request.
-
-    Returns an async context manager that yields the request record and
-    persists any in-place updates upon exit.
-    """
-
-    @contextlib.asynccontextmanager
-    async def _cm():
-        # Acquire the lock to avoid race conditions between multiple request
-        # operations, e.g. execute and cancel.
-        async with filelock.AsyncFileLock(request_lock_path(request_id)):
-            request = await _get_request_no_lock_async(request_id)
-            try:
-                yield request
-            finally:
-                if request is not None:
-                    await _add_or_update_request_no_lock_async(request)
-
-    return _cm()
+@asyncio_utils.shield
+async def update_status_msg_async(request_id: str, status_msg: str) -> None:
+    """Update the status message of a request"""
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if request is not None:
+            request.status_msg = status_msg
+            await _add_or_update_request_no_lock_async(request)


 _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
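asyncio_utils.shield is added in this release (sky/utils/asyncio_utils.py, +18 lines, not shown in the diff) and is applied to these DB writes, evidently so that a cancelled caller cannot interrupt them mid-transaction. A plausible minimal sketch of such a decorator built on asyncio.shield, hedged as an assumption from the name and usage rather than the actual SkyPilot code:

import asyncio
import functools

def shield(coro_func):
    """Run the wrapped coroutine under asyncio.shield.

    If the awaiting caller is cancelled, the caller still sees
    CancelledError, but the inner coroutine runs to completion, so a
    half-finished write cannot be left behind.
    """
    @functools.wraps(coro_func)
    async def wrapper(*args, **kwargs):
        return await asyncio.shield(coro_func(*args, **kwargs))
    return wrapper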
@@ -651,8 +639,10 @@ def get_request(request_id: str) -> Optional[Request]:

 @init_db_async
 @metrics_lib.time_me_async
+@asyncio_utils.shield
 async def get_request_async(request_id: str) -> Optional[Request]:
     """Async version of get_request."""
+    # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
     async with filelock.AsyncFileLock(request_lock_path(request_id)):
         return await _get_request_no_lock_async(request_id)

@@ -704,6 +694,7 @@ def create_if_not_exists(request: Request) -> bool:

 @init_db_async
 @metrics_lib.time_me_async
+@asyncio_utils.shield
 async def create_if_not_exists_async(request: Request) -> bool:
     """Async version of create_if_not_exists."""
     async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
sky/server/requests/threads.py ADDED
@@ -0,0 +1,106 @@
+"""Request execution threads management."""
+
+import concurrent.futures
+import threading
+from typing import Callable, Set
+
+from sky import exceptions
+from sky import sky_logging
+from sky.utils import atomic
+
+logger = sky_logging.init_logger(__name__)
+
+
+class OnDemandThreadExecutor(concurrent.futures.Executor):
+    """An executor that creates a new thread for each task and destroys it
+    after the task is completed.
+
+    Note(dev):
+    We raise an error instead of queuing the request if the limit is reached,
+    so that:
+    1. the request might be handled by other processes that have idle workers
+       upon retry;
+    2. if not, then users can be clearly hinted that they need to scale the
+       API server to support higher concurrency.
+    So this executor is only suitable for carefully selected cases where the
+    error can be properly handled by the caller. To make this executor
+    general, we would need to support configuring the queuing behavior
+    (exception or queueing).
+    """
+
+    def __init__(self, name: str, max_workers: int):
+        self.name: str = name
+        self.max_workers: int = max_workers
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+        self._shutdown: bool = False
+        self._shutdown_lock: threading.Lock = threading.Lock()
+        self._threads: Set[threading.Thread] = set()
+        self._threads_lock: threading.Lock = threading.Lock()
+
+    def _cleanup_thread(self, thread: threading.Thread):
+        with self._threads_lock:
+            self._threads.discard(thread)
+
+    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
+                      *args, **kwargs):
+        try:
+            result = fn(*args, **kwargs)
+            fut.set_result(result)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
+            fut.set_exception(e)
+        finally:
+            self.running.decrement()
+            self._cleanup_thread(threading.current_thread())
+
+    def check_available(self, borrow: bool = False) -> int:
+        """Check if there are available workers.
+
+        Args:
+            borrow: If True, the caller borrows a worker from the executor.
+                The caller is responsible for returning the worker to the
+                executor after the task is completed.
+        """
+        count = self.running.increment()
+        if count > self.max_workers:
+            self.running.decrement()
+            raise exceptions.ConcurrentWorkerExhaustedError(
+                f'Maximum concurrent workers {self.max_workers} of threads '
+                f'executor [{self.name}] reached')
+        if not borrow:
+            self.running.decrement()
+        return count
+
+    def submit(self, fn, /, *args, **kwargs):
+        with self._shutdown_lock:
+            if self._shutdown:
+                raise RuntimeError(
+                    'Cannot submit task after executor is shutdown')
+            count = self.check_available(borrow=True)
+            fut: concurrent.futures.Future = concurrent.futures.Future()
+            # Name is assigned for debugging purpose, duplication is fine
+            thread = threading.Thread(target=self._task_wrapper,
+                                      name=f'{self.name}-{count}',
+                                      args=(fn, fut, *args),
+                                      kwargs=kwargs,
+                                      daemon=True)
+            with self._threads_lock:
+                self._threads.add(thread)
+            try:
+                thread.start()
+            except Exception as e:
+                self.running.decrement()
+                self._cleanup_thread(thread)
+                fut.set_exception(e)
+                raise
+            assert thread.ident is not None, 'Thread should be started'
+            return fut
+
+    def shutdown(self, wait=True):
+        with self._shutdown_lock:
+            self._shutdown = True
+        if not wait:
+            return
+        with self._threads_lock:
+            threads = list(self._threads)
+        for t in threads:
+            t.join()
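Callers first probe availability (as serve.logs does above via check_request_thread_executor_available) and translate ConcurrentWorkerExhaustedError into a retryable response; submit then borrows a worker for the task's lifetime. A usage sketch of the fail-fast behavior, assuming the sky modules above are importable from this wheel:

import time

from sky import exceptions
from sky.server.requests import threads

executor = threads.OnDemandThreadExecutor(name='example', max_workers=2)

def work(seconds: float) -> str:
    time.sleep(seconds)
    return f'slept {seconds}s'

futures = []
try:
    for _ in range(3):  # the third submit exceeds max_workers=2
        futures.append(executor.submit(work, 0.5))
except exceptions.ConcurrentWorkerExhaustedError as e:
    # Rejected instead of queued: the client is expected to retry,
    # ideally landing on a server process with idle workers.
    print(f'rejected: {e}')

print([f.result() for f in futures])
executor.shutdown(wait=True)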