skypilot-nightly 1.0.0.dev20251015__py3-none-any.whl → 1.0.0.dev20251016__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (45) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +25 -12
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/chunks/{webpack-ac3a34c8f9fef041.js → webpack-66f23594d38c7f16.js} +1 -1
  5. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  6. sky/dashboard/out/clusters/[cluster].html +1 -1
  7. sky/dashboard/out/clusters.html +1 -1
  8. sky/dashboard/out/config.html +1 -1
  9. sky/dashboard/out/index.html +1 -1
  10. sky/dashboard/out/infra/[context].html +1 -1
  11. sky/dashboard/out/infra.html +1 -1
  12. sky/dashboard/out/jobs/[job].html +1 -1
  13. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  14. sky/dashboard/out/jobs.html +1 -1
  15. sky/dashboard/out/users.html +1 -1
  16. sky/dashboard/out/volumes.html +1 -1
  17. sky/dashboard/out/workspace/new.html +1 -1
  18. sky/dashboard/out/workspaces/[name].html +1 -1
  19. sky/dashboard/out/workspaces.html +1 -1
  20. sky/exceptions.py +13 -1
  21. sky/jobs/constants.py +1 -1
  22. sky/jobs/scheduler.py +2 -4
  23. sky/jobs/server/core.py +2 -1
  24. sky/jobs/server/server.py +5 -3
  25. sky/jobs/state.py +12 -6
  26. sky/jobs/utils.py +8 -2
  27. sky/schemas/generated/jobsv1_pb2.py +52 -52
  28. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  29. sky/serve/server/server.py +1 -0
  30. sky/server/requests/executor.py +51 -15
  31. sky/server/requests/requests.py +1 -0
  32. sky/server/requests/threads.py +106 -0
  33. sky/server/rest.py +36 -18
  34. sky/server/server.py +24 -0
  35. sky/skylet/constants.py +1 -1
  36. sky/skylet/services.py +3 -1
  37. sky/utils/context_utils.py +2 -0
  38. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/METADATA +33 -33
  39. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/RECORD +45 -44
  40. /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → pbgtEUoCUdmJyLHjgln5A}/_buildManifest.js +0 -0
  41. /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → pbgtEUoCUdmJyLHjgln5A}/_ssgManifest.js +0 -0
  42. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/WHEEL +0 -0
  43. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/entry_points.txt +0 -0
  44. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/licenses/LICENSE +0 -0
  45. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251016.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
14
14
 
15
15
 
16
16
 
17
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n 
\x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
17
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 
\x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 
\x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
18
18
 
19
19
  _globals = globals()
20
20
  _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
25
25
  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
26
26
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
27
27
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
28
- _globals['_JOBSTATUS']._serialized_start=2185
29
- _globals['_JOBSTATUS']._serialized_end=2454
28
+ _globals['_JOBSTATUS']._serialized_start=2219
29
+ _globals['_JOBSTATUS']._serialized_end=2488
30
30
  _globals['_ADDJOBREQUEST']._serialized_start=48
31
31
  _globals['_ADDJOBREQUEST']._serialized_end=181
32
32
  _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -34,53 +34,53 @@ if not _descriptor._USE_C_DESCRIPTORS:
34
34
  _globals['_QUEUEJOBREQUEST']._serialized_start=235
35
35
  _globals['_QUEUEJOBREQUEST']._serialized_end=414
36
36
  _globals['_MANAGEDJOBINFO']._serialized_start=417
37
- _globals['_MANAGEDJOBINFO']._serialized_end=554
38
- _globals['_MANAGEDJOBTASK']._serialized_start=556
39
- _globals['_MANAGEDJOBTASK']._serialized_end=649
40
- _globals['_QUEUEJOBRESPONSE']._serialized_start=651
41
- _globals['_QUEUEJOBRESPONSE']._serialized_end=669
42
- _globals['_UPDATESTATUSREQUEST']._serialized_start=671
43
- _globals['_UPDATESTATUSREQUEST']._serialized_end=692
44
- _globals['_UPDATESTATUSRESPONSE']._serialized_start=694
45
- _globals['_UPDATESTATUSRESPONSE']._serialized_end=716
46
- _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
47
- _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
48
- _globals['_JOBINFO']._serialized_start=797
49
- _globals['_JOBINFO']._serialized_end=1088
50
- _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
51
- _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
52
- _globals['_CANCELJOBSREQUEST']._serialized_start=1145
53
- _globals['_CANCELJOBSREQUEST']._serialized_end=1239
54
- _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
55
- _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
56
- _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
57
- _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
58
- _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
59
- _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
60
- _globals['_TAILLOGSREQUEST']._serialized_start=1355
61
- _globals['_TAILLOGSREQUEST']._serialized_end=1482
62
- _globals['_TAILLOGSRESPONSE']._serialized_start=1484
63
- _globals['_TAILLOGSRESPONSE']._serialized_end=1539
64
- _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
65
- _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
66
- _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
67
- _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
68
- _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
69
- _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
70
- _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
71
- _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
72
- _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
73
- _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
74
- _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
75
- _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
76
- _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
77
- _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
78
- _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
79
- _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
80
- _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
81
- _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
82
- _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
83
- _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
84
- _globals['_JOBSSERVICE']._serialized_start=2457
85
- _globals['_JOBSSERVICE']._serialized_end=3370
37
+ _globals['_MANAGEDJOBINFO']._serialized_end=588
38
+ _globals['_MANAGEDJOBTASK']._serialized_start=590
39
+ _globals['_MANAGEDJOBTASK']._serialized_end=683
40
+ _globals['_QUEUEJOBRESPONSE']._serialized_start=685
41
+ _globals['_QUEUEJOBRESPONSE']._serialized_end=703
42
+ _globals['_UPDATESTATUSREQUEST']._serialized_start=705
43
+ _globals['_UPDATESTATUSREQUEST']._serialized_end=726
44
+ _globals['_UPDATESTATUSRESPONSE']._serialized_start=728
45
+ _globals['_UPDATESTATUSRESPONSE']._serialized_end=750
46
+ _globals['_GETJOBQUEUEREQUEST']._serialized_start=752
47
+ _globals['_GETJOBQUEUEREQUEST']._serialized_end=828
48
+ _globals['_JOBINFO']._serialized_start=831
49
+ _globals['_JOBINFO']._serialized_end=1122
50
+ _globals['_GETJOBQUEUERESPONSE']._serialized_start=1124
51
+ _globals['_GETJOBQUEUERESPONSE']._serialized_end=1177
52
+ _globals['_CANCELJOBSREQUEST']._serialized_start=1179
53
+ _globals['_CANCELJOBSREQUEST']._serialized_end=1273
54
+ _globals['_CANCELJOBSRESPONSE']._serialized_start=1275
55
+ _globals['_CANCELJOBSRESPONSE']._serialized_end=1322
56
+ _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1324
57
+ _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1354
58
+ _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1356
59
+ _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1387
60
+ _globals['_TAILLOGSREQUEST']._serialized_start=1389
61
+ _globals['_TAILLOGSREQUEST']._serialized_end=1516
62
+ _globals['_TAILLOGSRESPONSE']._serialized_start=1518
63
+ _globals['_TAILLOGSRESPONSE']._serialized_end=1573
64
+ _globals['_GETJOBSTATUSREQUEST']._serialized_start=1575
65
+ _globals['_GETJOBSTATUSREQUEST']._serialized_end=1613
66
+ _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1616
67
+ _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1780
68
+ _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1710
69
+ _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1780
70
+ _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1782
71
+ _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1847
72
+ _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1849
73
+ _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1902
74
+ _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1904
75
+ _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1965
76
+ _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1967
77
+ _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=2016
78
+ _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=2018
79
+ _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2061
80
+ _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2064
81
+ _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2216
82
+ _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2167
83
+ _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2216
84
+ _globals['_JOBSSERVICE']._serialized_start=2491
85
+ _globals['_JOBSSERVICE']._serialized_end=3404
86
86
  # @@protoc_insertion_point(module_scope)
@@ -66,18 +66,20 @@ class QueueJobRequest(_message.Message):
66
66
  def __init__(self, job_id: _Optional[int] = ..., codegen: _Optional[str] = ..., script_path: _Optional[str] = ..., remote_log_dir: _Optional[str] = ..., managed_job: _Optional[_Union[ManagedJobInfo, _Mapping]] = ...) -> None: ...
67
67
 
68
68
  class ManagedJobInfo(_message.Message):
69
- __slots__ = ("name", "pool", "workspace", "entrypoint", "tasks")
69
+ __slots__ = ("name", "pool", "workspace", "entrypoint", "tasks", "user_id")
70
70
  NAME_FIELD_NUMBER: _ClassVar[int]
71
71
  POOL_FIELD_NUMBER: _ClassVar[int]
72
72
  WORKSPACE_FIELD_NUMBER: _ClassVar[int]
73
73
  ENTRYPOINT_FIELD_NUMBER: _ClassVar[int]
74
74
  TASKS_FIELD_NUMBER: _ClassVar[int]
75
+ USER_ID_FIELD_NUMBER: _ClassVar[int]
75
76
  name: str
76
77
  pool: str
77
78
  workspace: str
78
79
  entrypoint: str
79
80
  tasks: _containers.RepeatedCompositeFieldContainer[ManagedJobTask]
80
- def __init__(self, name: _Optional[str] = ..., pool: _Optional[str] = ..., workspace: _Optional[str] = ..., entrypoint: _Optional[str] = ..., tasks: _Optional[_Iterable[_Union[ManagedJobTask, _Mapping]]] = ...) -> None: ...
81
+ user_id: str
82
+ def __init__(self, name: _Optional[str] = ..., pool: _Optional[str] = ..., workspace: _Optional[str] = ..., entrypoint: _Optional[str] = ..., tasks: _Optional[_Iterable[_Union[ManagedJobTask, _Mapping]]] = ..., user_id: _Optional[str] = ...) -> None: ...
81
83
 
82
84
  class ManagedJobTask(_message.Message):
83
85
  __slots__ = ("task_id", "name", "resources_str", "metadata_json")
@@ -98,6 +98,7 @@ async def tail_logs(
98
98
  request: fastapi.Request, log_body: payloads.ServeLogsBody,
99
99
  background_tasks: fastapi.BackgroundTasks
100
100
  ) -> fastapi.responses.StreamingResponse:
101
+ executor.check_request_thread_executor_available()
101
102
  request_task = executor.prepare_request(
102
103
  request_id=request.state.request_id,
103
104
  request_name='serve.logs',
@@ -48,6 +48,7 @@ from sky.server.requests import payloads
48
48
  from sky.server.requests import preconditions
49
49
  from sky.server.requests import process
50
50
  from sky.server.requests import requests as api_requests
51
+ from sky.server.requests import threads
51
52
  from sky.server.requests.queues import local_queue
52
53
  from sky.server.requests.queues import mp_queue
53
54
  from sky.skylet import constants
@@ -81,23 +82,28 @@ logger = sky_logging.init_logger(__name__)
81
82
  # platforms, including macOS.
82
83
  multiprocessing.set_start_method('spawn', force=True)
83
84
 
84
- # Max threads that is equivalent to the number of thread workers in the
85
- # default thread pool executor of event loop.
86
- _REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 0) + 4)
85
+ # An upper limit on threads for request execution per server process that is
86
+ # unlikely to be reached, to allow higher concurrency while still preventing
87
+ # the server process from becoming overloaded.
88
+ _REQUEST_THREADS_LIMIT = 128
87
89
 
88
90
  _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
89
- # A dedicated thread pool executor for synced requests execution in coroutine
90
- _REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
91
+ # A dedicated thread pool executor for synced requests execution in coroutine to
92
+ # avoid:
93
+ # 1. blocking the event loop;
94
+ # 2. exhausting the default thread pool executor of event loop;
95
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
91
96
 
92
97
 
93
def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
    """Return the per-process request thread executor, creating it lazily.

    Uses double-checked locking: the common already-initialized path avoids
    taking the lock entirely.
    """
    global _REQUEST_THREAD_EXECUTOR
    executor = _REQUEST_THREAD_EXECUTOR
    if executor is None:
        with _REQUEST_THREAD_EXECUTOR_LOCK:
            if _REQUEST_THREAD_EXECUTOR is None:
                _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
                    name='request_thread_executor',
                    max_workers=_REQUEST_THREADS_LIMIT)
            executor = _REQUEST_THREAD_EXECUTOR
    return executor
103
109
 
@@ -561,6 +567,21 @@ class CoroutineTask:
561
567
  pass
562
568
 
563
569
 
570
def check_request_thread_executor_available() -> None:
    """Check whether the request thread executor has an available worker.

    This is a best-effort check to hint the client to retry other server
    processes when there is no available thread worker in the current one.
    A request may still pass this check and then fail to acquire a worker
    at execution time due to a race condition; in that case, the client
    will see a failed request instead of a retry.

    Raises:
        exceptions.ConcurrentWorkerExhaustedError: if all thread workers
            of the executor in this server process are busy.

    TODO(aylei): this can be refined with a refactor of our coroutine
    execution flow.
    """
    get_request_thread_executor().check_available()
583
+
584
+
564
585
  def execute_request_in_coroutine(
565
586
  request: api_requests.Request) -> CoroutineTask:
566
587
  """Execute a request in current event loop.
@@ -575,6 +596,18 @@ def execute_request_in_coroutine(
575
596
  return CoroutineTask(task)
576
597
 
577
598
 
599
def _execute_with_config_override(func: Callable,
                                  request_body: payloads.RequestBody,
                                  request_id: str, request_name: str,
                                  **kwargs) -> Any:
    """Run ``func`` in a thread with the request's env and config applied.

    The override context manager scopes environment variables and config to
    this thread's context, which gets copied when dispatched via to_thread.
    """
    override_cm = override_request_env_and_config(request_body, request_id,
                                                  request_name)
    with override_cm:
        return func(**kwargs)
609
+
610
+
578
611
  async def _execute_request_coroutine(request: api_requests.Request):
579
612
  """Execute a request in current event loop.
580
613
 
@@ -592,14 +625,17 @@ async def _execute_request_coroutine(request: api_requests.Request):
592
625
  request_task.status = api_requests.RequestStatus.RUNNING
593
626
  # Redirect stdout and stderr to the request log path.
594
627
  original_output = ctx.redirect_log(request.log_path)
595
- # Override environment variables that backs env_options.Options
596
- # TODO(aylei): compared to process executor, running task in coroutine has
597
- # two issues to fix:
598
- # 1. skypilot config is not contextual
599
- # 2. envs that read directly from os.environ are not contextual
600
- ctx.override_envs(request_body.env_vars)
601
- fut: asyncio.Future = context_utils.to_thread_with_executor(
602
- get_request_thread_executor(), func, **request_body.to_kwargs())
628
+ try:
629
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
630
+ get_request_thread_executor(), _execute_with_config_override, func,
631
+ request_body, request.request_id, request.name,
632
+ **request_body.to_kwargs())
633
+ except Exception as e: # pylint: disable=broad-except
634
+ ctx.redirect_log(original_output)
635
+ api_requests.set_request_failed(request.request_id, e)
636
+ logger.error(f'Failed to run request {request.request_id} due to '
637
+ f'{common_utils.format_exception(e)}')
638
+ return
603
639
 
604
640
  async def poll_task(request_id: str) -> bool:
605
641
  req_status = await api_requests.get_request_status_async(request_id)
@@ -642,6 +642,7 @@ def get_request(request_id: str) -> Optional[Request]:
642
642
  @asyncio_utils.shield
643
643
  async def get_request_async(request_id: str) -> Optional[Request]:
644
644
  """Async version of get_request."""
645
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
645
646
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
646
647
  return await _get_request_no_lock_async(request_id)
647
648
 
@@ -0,0 +1,106 @@
1
+ """Request execution threads management."""
2
+
3
+ import concurrent.futures
4
+ import threading
5
+ from typing import Callable, Set
6
+
7
+ from sky import exceptions
8
+ from sky import sky_logging
9
+ from sky.utils import atomic
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
class OnDemandThreadExecutor(concurrent.futures.Executor):
    """An executor that creates a new thread for each task and destroys it
    after the task is completed.

    Unlike ``ThreadPoolExecutor``, tasks are never queued: when the number
    of concurrently running tasks would exceed ``max_workers``, submission
    raises instead.

    Note(dev):
    We raise an error instead of queuing the request if the limit is reached,
    so that:
    1. the request might be handled by other processes that have idle workers
       upon retry;
    2. if not, then users can be clearly hinted that they need to scale the
       API server to support higher concurrency.
    So this executor is only suitable for carefully selected cases where the
    error can be properly handled by caller. To make this executor general,
    we need to support configuring the queuing behavior (exception or
    queueing).
    """

    def __init__(self, name: str, max_workers: int):
        """Initialize the executor.

        Args:
            name: Human-readable name, used for worker thread names and logs.
            max_workers: Hard cap on concurrently running worker threads.
        """
        self.name: str = name
        self.max_workers: int = max_workers
        # Count of running (or borrowed, see check_available) workers.
        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
        self._shutdown: bool = False
        self._shutdown_lock: threading.Lock = threading.Lock()
        # Live worker threads, tracked so shutdown(wait=True) can join them.
        self._threads: Set[threading.Thread] = set()
        self._threads_lock: threading.Lock = threading.Lock()

    def _cleanup_thread(self, thread: threading.Thread):
        """Drop a finished worker thread from the tracking set."""
        with self._threads_lock:
            self._threads.discard(thread)

    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
                      *args, **kwargs):
        """Run ``fn`` in a worker thread and publish its outcome to ``fut``."""
        try:
            # Honor a cancellation that happened before this thread ran:
            # calling set_result/set_exception on a cancelled future would
            # raise InvalidStateError inside the worker otherwise.
            if not fut.set_running_or_notify_cancel():
                return
            try:
                result = fn(*args, **kwargs)
            # Catch BaseException (matching concurrent.futures semantics):
            # letting e.g. SystemExit escape would leave the future forever
            # unresolved and hang any waiter.
            except BaseException as e:  # pylint: disable=broad-except
                logger.debug(
                    f'Executor [{self.name}] error executing {fn}: {e}')
                fut.set_exception(e)
            else:
                fut.set_result(result)
        finally:
            self.running.decrement()
            self._cleanup_thread(threading.current_thread())

    def check_available(self, borrow: bool = False) -> int:
        """Check if there are available workers.

        Args:
            borrow: If True, the caller borrows a worker from the executor.
                The caller is responsible for returning the worker to the
                executor after the task is completed.

        Returns:
            The running-worker count observed at increment time.

        Raises:
            exceptions.ConcurrentWorkerExhaustedError: if all
                ``max_workers`` slots are occupied.
        """
        count = self.running.increment()
        if count > self.max_workers:
            self.running.decrement()
            raise exceptions.ConcurrentWorkerExhaustedError(
                f'Maximum concurrent workers {self.max_workers} of threads '
                f'executor [{self.name}] reached')
        if not borrow:
            self.running.decrement()
        return count

    def submit(self, fn, /, *args, **kwargs):
        """Run ``fn(*args, **kwargs)`` on a fresh thread, returning a Future.

        Raises:
            RuntimeError: if the executor has been shut down.
            exceptions.ConcurrentWorkerExhaustedError: if no worker slot
                is available.
        """
        with self._shutdown_lock:
            if self._shutdown:
                raise RuntimeError(
                    'Cannot submit task after executor is shutdown')
        count = self.check_available(borrow=True)
        fut: concurrent.futures.Future = concurrent.futures.Future()
        # Name is assigned for debugging purpose, duplication is fine
        thread = threading.Thread(target=self._task_wrapper,
                                  name=f'{self.name}-{count}',
                                  args=(fn, fut, *args),
                                  kwargs=kwargs,
                                  daemon=True)
        with self._threads_lock:
            self._threads.add(thread)
        try:
            thread.start()
        except Exception as e:
            # Return the borrowed slot and surface the failure both on the
            # future (for waiters) and to the caller.
            self.running.decrement()
            self._cleanup_thread(thread)
            fut.set_exception(e)
            raise
        return fut

    def shutdown(self, wait=True):
        """Stop accepting new tasks; optionally wait for running ones.

        Args:
            wait: If True, block until all worker threads have finished.
        """
        with self._shutdown_lock:
            self._shutdown = True
        if not wait:
            return
        # Join until the tracking set drains: a submit() racing with the
        # shutdown flag may start a thread after a single snapshot is taken,
        # so one pass over a snapshot is not sufficient.
        while True:
            with self._threads_lock:
                threads = list(self._threads)
            if not threads:
                return
            for t in threads:
                t.join()
sky/server/rest.py CHANGED
@@ -178,14 +178,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
178
178
  Notes(dev):
179
179
  """
180
180
 
181
+ def _readable_error_msg(message: str) -> str:
182
+ return (f'{colorama.Fore.YELLOW}API server is temporarily '
183
+ f'unavailable: {message}.\nRetrying...'
184
+ f'{colorama.Style.RESET_ALL}')
185
+
181
186
  def decorator(func: F) -> F:
182
187
 
183
188
  @functools.wraps(func)
184
189
  def wrapper(*args, **kwargs) -> Any:
185
- msg = (
186
- f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
187
- 'upgrade in progress. Waiting to resume...'
188
- f'{colorama.Style.RESET_ALL}')
190
+
189
191
  backoff = common_utils.Backoff(
190
192
  initial_backoff=initial_backoff,
191
193
  max_backoff_factor=max_backoff_factor)
@@ -203,7 +205,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
203
205
  # stop the status spinner before retrying func() to
204
206
  # avoid the status spinner get stuck if the func() runs
205
207
  # for a long time without update status, e.g. sky logs.
206
- with rich_utils.client_status(msg):
208
+ with rich_utils.client_status(
209
+ _readable_error_msg(e.message)):
207
210
  if time.time() - start_time > max_wait_seconds:
208
211
  # pylint: disable=line-too-long
209
212
  raise exceptions.ServerTemporarilyUnavailableError(
@@ -224,14 +227,33 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
224
227
 
225
228
 
226
229
  def handle_server_unavailable(response: 'requests.Response') -> None:
227
- if response.status_code == 503:
228
- # TODO(aylei): Hacky, depends on how nginx controller handles backends
229
- # with no ready endpoints. Should use self-defined status code or header
230
- # to distinguish retryable server error from general 503 errors.
231
- with ux_utils.print_exception_no_traceback():
232
- raise exceptions.ServerTemporarilyUnavailableError(
233
- 'SkyPilot API server is temporarily unavailable. '
234
- 'Please try again later.')
230
+ """Handle 503 (Service Unavailable) error
231
+
232
+ The client get 503 error in the following cases:
233
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
234
+ request, e.g. when there is a rolling update.
235
+ 2. The skypilot API server has temporary resource issue, e.g. when the
236
+ concurrency of the handling process is exhausted.
237
+
238
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
239
+ wait message to the user, letting the user decide whether to keep waiting
240
+ or abort the request.
241
+ """
242
+ if response.status_code != 503:
243
+ return
244
+
245
+ # error_msg = 'SkyPilot API server is temporarily unavailable. '
246
+ error_msg = ''
247
+ try:
248
+ response_data = response.json()
249
+ if 'detail' in response_data:
250
+ error_msg = response_data['detail']
251
+ except Exception: # pylint: disable=broad-except
252
+ if response.text:
253
+ error_msg = response.text
254
+
255
+ with ux_utils.print_exception_no_traceback():
256
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
235
257
 
236
258
 
237
259
  @_retry_on_server_unavailable()
@@ -310,11 +332,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
310
332
  response = await session.request(method, url, **kwargs)
311
333
 
312
334
  # Handle server unavailability (503 status) - same as sync version
313
- if response.status == 503:
314
- with ux_utils.print_exception_no_traceback():
315
- raise exceptions.ServerTemporarilyUnavailableError(
316
- 'SkyPilot API server is temporarily unavailable. '
317
- 'Please try again later.')
335
+ handle_server_unavailable(response)
318
336
 
319
337
  # Set remote API version and version from headers - same as sync version
320
338
  remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
sky/server/server.py CHANGED
@@ -17,6 +17,7 @@ import resource
17
17
  import shutil
18
18
  import sys
19
19
  import threading
20
+ import traceback
20
21
  from typing import Dict, List, Literal, Optional, Set, Tuple
21
22
  import uuid
22
23
  import zipfile
@@ -74,6 +75,7 @@ from sky.utils import dag_utils
74
75
  from sky.utils import perf_utils
75
76
  from sky.utils import status_lib
76
77
  from sky.utils import subprocess_utils
78
+ from sky.utils import ux_utils
77
79
  from sky.utils.db import db_utils
78
80
  from sky.volumes.server import server as volumes_rest
79
81
  from sky.workspaces import server as workspaces_rest
@@ -664,6 +666,25 @@ except Exception: # pylint: disable=broad-except
664
666
  pass # no issue, we will warn the user later if its too low
665
667
 
666
668
 
669
+ @app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
670
+ def handle_concurrent_worker_exhausted_error(
671
+ request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
672
+ del request # request is not used
673
+ # Print detailed error message to server log
674
+ logger.error('Concurrent worker exhausted: '
675
+ f'{common_utils.format_exception(e)}')
676
+ with ux_utils.enable_traceback():
677
+ logger.error(f' Traceback: {traceback.format_exc()}')
678
+ # Return human readable error message to client
679
+ return fastapi.responses.JSONResponse(
680
+ status_code=503,
681
+ content={
682
+ 'detail':
683
+ ('The server has exhausted its concurrent worker limit. '
684
+ 'Please try again or scale the server if the load persists.')
685
+ })
686
+
687
+
667
688
  @app.get('/token')
668
689
  async def token(request: fastapi.Request,
669
690
  local_port: Optional[int] = None) -> fastapi.responses.Response:
@@ -1232,6 +1253,7 @@ async def logs(
1232
1253
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
1233
1254
  # launch, to finish, so that a user does not need to manually pull the
1234
1255
  # request status.
1256
+ executor.check_request_thread_executor_available()
1235
1257
  request_task = executor.prepare_request(
1236
1258
  request_id=request.state.request_id,
1237
1259
  request_name='logs',
@@ -1466,6 +1488,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1466
1488
  # to avoid storming the DB and CPU in the meantime
1467
1489
  await asyncio.sleep(0.1)
1468
1490
  request_task = await requests_lib.get_request_async(request_id)
1491
+ # TODO(aylei): refine this, /api/get will not be retried and this is
1492
+ # meaningless to retry. It is the original request that should be retried.
1469
1493
  if request_task.should_retry:
1470
1494
  raise fastapi.HTTPException(
1471
1495
  status_code=503, detail=f'Request {request_id!r} should be retried')
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
100
100
  # cluster yaml is updated.
101
101
  #
102
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
103
- SKYLET_VERSION = '22'
103
+ SKYLET_VERSION = '23'
104
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
105
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
106
106
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/services.py CHANGED
@@ -216,10 +216,12 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
216
216
  if pool is not None:
217
217
  pool_hash = serve_state.get_service_hash(pool)
218
218
  # Add the managed job to job queue database.
219
+ user_id = managed_job.user_id if managed_job.HasField(
220
+ 'user_id') else None
219
221
  managed_job_state.set_job_info(job_id, managed_job.name,
220
222
  managed_job.workspace,
221
223
  managed_job.entrypoint, pool,
222
- pool_hash)
224
+ pool_hash, user_id)
223
225
  # Set the managed job to PENDING state to make sure that
224
226
  # this managed job appears in the `sky jobs queue`, even
225
227
  # if it needs to wait to be submitted.
@@ -19,6 +19,8 @@ from sky.utils import subprocess_utils
19
19
 
20
20
  StreamHandler = Callable[[IO[Any], IO[Any]], str]
21
21
 
22
+ logger = sky_logging.init_logger(__name__)
23
+
22
24
 
23
25
  # TODO(aylei): call hijack_sys_attrs() proactivly in module init at server-side
24
26
  # once we have context widely adopted.