skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -23,6 +23,7 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
@@ -350,6 +351,13 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
                     f'file does not match the pool argument. '
                     f'To fix, add a valid `{field_name}` field.')
 
+    # Validate that pools do not use ordered resources
+    if pool and isinstance(task.resources, list):
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Ordered resources are not supported for pools. '
+                'Use `any_of` instead, or specify a single resource.')
+
     policy_description = ('on-demand'
                          if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
@@ -360,22 +368,6 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
                 f'{sys_name} will replenish preempted spot '
                 f'with {policy_description} instances.')
 
-    if pool:
-        accelerators = set()
-        for resource in task.resources:
-            if resource.accelerators is not None:
-                if isinstance(resource.accelerators, str):
-                    accelerators.add(resource.accelerators)
-                elif isinstance(resource.accelerators, dict):
-                    accelerators.update(resource.accelerators.keys())
-                elif isinstance(resource.accelerators, list):
-                    accelerators.update(resource.accelerators)
-        if len(accelerators) > 1:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('Heterogeneous clusters are not supported for '
-                                 'pools please specify one accelerator '
-                                 'for all workers.')
-
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
@@ -730,7 +722,7 @@ def _get_service_status(
         for replica_info in record['replica_info']:
            job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
                service_name, replica_info['name'])
-            replica_info['used_by'] = job_ids[0] if job_ids else None
+            replica_info['used_by'] = job_ids
     return record
 
 
@@ -810,16 +802,112 @@ def get_ready_replicas(
     ]
 
 
-def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
-    """Get the next available cluster name from idle replicas.
+def _task_fits(task_resources: 'resources_lib.Resources',
+               free_resources: 'resources_lib.Resources') -> bool:
+    """Check if the task resources fit in the free resources."""
+    if not task_resources.less_demanding_than(free_resources,
+                                              check_cloud=False):
+        return False
+    if task_resources.cpus is not None:
+        if (free_resources.cpus is None or
+                task_resources.cpus > free_resources.cpus):
+            return False
+    if task_resources.memory is not None:
+        if (free_resources.memory is None or
+                task_resources.memory > free_resources.memory):
+            return False
+    return True
+
+
+def _is_empty_resource(resource: 'resources_lib.Resources') -> bool:
+    # Returns True if this resource object does not specify any resources.
+    return (resource.cpus is None and resource.memory is None and
+            resource.accelerators is None)
+
+
+def get_free_worker_resources(
+        pool: str) -> Optional[Dict[str, Optional[resources_lib.Resources]]]:
+    """Get free resources for each worker in a pool.
+
+    Args:
+        pool: Pool name (service name)
+
+    Returns:
+        Dictionary mapping cluster_name (worker) to free Resources object (or
+        None if worker is not available or has no free resources).
+    """
+
+    free_resources: Dict[str, Optional[resources_lib.Resources]] = {}
+    replicas = serve_state.get_replica_infos(pool)
+
+    for replica_info in replicas:
+        cluster_name = replica_info.cluster_name
+
+        # Get cluster handle
+        handle = replica_info.handle()
+        if handle is None or handle.launched_resources is None:
+            free_resources[cluster_name] = None
+            continue
+
+        total_resources = handle.launched_resources
+
+        # Get job IDs running on this worker
+        job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+            pool, cluster_name)
+
+        if len(job_ids) == 0:
+            free_resources[cluster_name] = total_resources
+            continue
+
+        # Get used resources
+        # TODO(lloyd): We should batch the database calls here so that we
+        # make a single call to get all the used resources for all the jobs.
+        used_resources = managed_job_state.get_pool_worker_used_resources(
+            set(job_ids))
+        if used_resources is None:
+            # We failed to get the used resources. We should return None since
+            # we can't make any guarantees about what resources are being used.
+            logger.warning(
+                f'Failed to get used resources for cluster {cluster_name!r}')
+            return None
+
+        if _is_empty_resource(used_resources):
+            # We encountered a job that has no resources specified. We
+            # will not consider it for resource-aware scheduling so it must
+            # be scheduled on its own. To do this we will set the free
+            # worker resources to nothing by returning an empty resource
+            # object.
+            logger.debug(f'Job {job_ids} has no resources specified. '
+                         'Skipping resource-aware scheduling for cluster '
+                         f'{cluster_name!r}')
+            free_resources[cluster_name] = resources_lib.Resources()
+        else:
+            # Calculate free resources using - operator
+            free = total_resources - used_resources
+            free_resources[cluster_name] = free
+
+    return free_resources
+
+
+def get_next_cluster_name(
+        service_name: str,
+        job_id: int,
+        task_resources: Optional[typing.Union[
+            'resources_lib.Resources', typing.Set['resources_lib.Resources'],
+            typing.List['resources_lib.Resources']]] = None
+) -> Optional[str]:
+    """Get the next available cluster name from replicas with sufficient
+    resources.
 
     Args:
         service_name: The name of the service.
-        job_id: Optional job ID to associate with the acquired cluster.
-            If None, a placeholder will be used.
+        job_id: Job ID to associate with the acquired cluster.
+        task_resources: Optional task resource requirements. If provided, will
+            check if resources fit in free worker resources. Can be
+            a single Resources object or a set/list of Resources objects.
 
     Returns:
-        The cluster name if an idle replica is found, None otherwise.
+        The cluster name if a suitable replica is found, None otherwise.
     """
     # Check if service exists
     service_status = _get_service_status(service_name,
@@ -831,36 +919,126 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
     if not service_status['pool']:
         logger.error(f'Service {service_name!r} is not a pool.')
         return None
+
     with filelock.FileLock(get_service_filelock_path(service_name)):
+        free_resources = get_free_worker_resources(service_name)
+        logger.debug(f'Free resources: {free_resources!r}')
         logger.debug(f'Get next cluster name for pool {service_name!r}')
         ready_replicas = get_ready_replicas(service_name)
+
+        logger.debug(f'Ready replicas: {ready_replicas!r}')
+
         idle_replicas: List['replica_managers.ReplicaInfo'] = []
-        for replica_info in ready_replicas:
-            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
-                service_name, replica_info.cluster_name)
-            # TODO(tian): Make it resources aware. Currently we allow and only
-            # allow one job per replica. In the following PR, we should:
-            # i) When the replica is launched with `any_of` resources (
-            #    replicas can have different resources), we should check if
-            #    the resources that jobs require are available on the replica.
-            #    e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
-            #    should only goes to replica with A100.
-            # ii) When a job only requires a subset of the resources on the
-            #    replica, each replica should be able to handle multiple jobs
-            #    at the same time. e.g., if a job requires A100:1 on a A100:8
-            #    pool, it should be able to run 4 jobs at the same time.
-            if not jobs_on_replica:
-                idle_replicas.append(replica_info)
+
+        # If task_resources is provided, use resource-aware scheduling
+        # Normalize task_resources to a list
+        if isinstance(task_resources, resources_lib.Resources):
+            task_resources_list = [task_resources]
+        elif isinstance(task_resources, (set, list)):
+            task_resources_list = list(task_resources)
+        else:
+            task_resources_list = []
+
+        # We should do resource aware scheduling if:
+        # 1. There are task resources.
+        # 2. The first task resource has some resources listed.
+        # 3. There are free resources.
+        # 4. Any free resource has some resources listed.
+        resource_aware = len(task_resources_list) > 0
+        resource_aware = (resource_aware and
+                          not _is_empty_resource(task_resources_list[0]))
+        resource_aware = resource_aware and free_resources is not None
+        if free_resources is not None:
+            for free_resource in free_resources.values():
+                if free_resource is not None and not _is_empty_resource(
+                        free_resource):
+                    resource_aware = True
+                    break
+            else:
+                resource_aware = False
+        else:
+            resource_aware = False
+
+        if resource_aware:
+            logger.debug('Doing resource aware scheduling')
+            for replica_info in ready_replicas:
+                cluster_name = replica_info.cluster_name
+                assert free_resources is not None
+                free_resources_on_worker = free_resources.get(cluster_name)
+                logger.debug(f'Free resources for cluster {cluster_name!r}: '
+                             f'{free_resources_on_worker!r}')
+
+                # Skip if worker has no free resources available
+                if free_resources_on_worker is None:
+                    logger.debug(f'Worker {cluster_name!r} has no free '
+                                 'resources')
+                    continue
+
+                # Check if any of the task resource options fit
+                fits = False
+                for task_res in task_resources_list:
+                    logger.debug(f'Task resources: {task_res!r}')
+                    if _task_fits(task_res, free_resources_on_worker):
+                        logger.debug(f'Task resources {task_res!r} fits'
+                                     ' in free resources '
+                                     f'{free_resources_on_worker!r}')
+                        fits = True
+                        break
+                    else:
+                        logger.debug(f'Task resources {task_res!r} does not fit'
+                                     ' in free resources '
+                                     f'{free_resources_on_worker!r}')
+                if fits:
+                    idle_replicas.append(replica_info)
+        # Also fall back to resource unaware scheduling if no idle replicas are
+        # found. This might be because our launched resources were improperly
+        # set. If that's the case then jobs will fail to schedule in a resource
+        # aware way because one of the resources will be `None` so we can just
+        # fallback to 1 job per replica. If we are truly resource bottlenecked
+        # then we will see that there are jobs running on the replica and will
+        # not schedule another.
+        if len(idle_replicas) == 0:
+            logger.debug('Falling back to resource unaware scheduling')
+            # Fall back to resource unaware scheduling if no task resources
+            # are provided.
+            for replica_info in ready_replicas:
+                jobs_on_replica = (
+                    managed_job_state.get_nonterminal_job_ids_by_pool(
+                        service_name, replica_info.cluster_name))
+                if not jobs_on_replica:
+                    idle_replicas.append(replica_info)
+
         if not idle_replicas:
             logger.info(f'No idle replicas found for pool {service_name!r}')
             return None
 
         # Select the first idle replica.
-        # TODO(tian): "Load balancing" policy.
         replica_info = idle_replicas[0]
         logger.info(f'Selected replica {replica_info.replica_id} with cluster '
                     f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
                     f'{service_name!r}')
+
+        # If job has heterogeneous resources (any_of/ordered), update
+        # full_resources to the specific resource that was selected for this
+        # worker. This must happen before releasing the filelock to ensure
+        # atomicity with the scheduling decision.
+        if resource_aware and len(task_resources_list) > 1:
+            assert free_resources is not None
+            free_resources_on_worker = free_resources.get(
+                replica_info.cluster_name)
+            if free_resources_on_worker is not None:
+                # Find which task resource fits on this worker
+                for task_res in task_resources_list:
+                    if _task_fits(task_res, free_resources_on_worker):
+                        # Update full_resources in database to this specific
+                        # resource
+                        logger.debug(
                            f'Updating full_resources for job {job_id!r} '
                            f'to selected resource: {task_res!r}')
+                        managed_job_state.update_job_full_resources(
+                            job_id, task_res.to_yaml_config())
+                        break
+
         managed_job_state.set_current_cluster_name(job_id,
                                                    replica_info.cluster_name)
         return replica_info.cluster_name
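In short, the scheduler now computes each worker's free capacity (launched resources minus what its non-terminal jobs use) and places a job on the first ready worker where any of the job's candidate resource options fits, falling back to the old one-job-per-replica rule when that information is unavailable. The standalone sketch below illustrates only that fit/subtract logic with a simplified stand-in type; `Capacity`, `fits`, `subtract`, and `pick_worker` are hypothetical names for illustration, not SkyPilot APIs (the real code uses `sky.resources.Resources` and the managed-jobs state store).

# Illustration of the fit/subtract idea behind the resource-aware pool
# scheduler above. Not SkyPilot code.
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class Capacity:
    cpus: Optional[float] = None    # None means "unspecified"
    memory: Optional[float] = None  # GiB
    accelerators: Dict[str, int] = field(default_factory=dict)


def fits(task: Capacity, free: Capacity) -> bool:
    """True if `task` fits in `free`; unspecified task fields always fit."""
    if task.cpus is not None and (free.cpus is None or task.cpus > free.cpus):
        return False
    if task.memory is not None and (free.memory is None or
                                    task.memory > free.memory):
        return False
    for acc, count in task.accelerators.items():
        if free.accelerators.get(acc, 0) < count:
            return False
    return True


def subtract(total: Capacity, used: Capacity) -> Capacity:
    """free = total - used, clamped at zero."""
    accs = {
        acc: max(0, total.accelerators.get(acc, 0) -
                 used.accelerators.get(acc, 0))
        for acc in total.accelerators
    }
    cpus = None if total.cpus is None else max(0.0, total.cpus - (used.cpus or 0))
    mem = None if total.memory is None else max(0.0,
                                                total.memory - (used.memory or 0))
    return Capacity(cpus=cpus, memory=mem, accelerators=accs)


def pick_worker(task_options: List[Capacity],
                free_by_worker: Dict[str, Capacity]) -> Optional[str]:
    """Return the first worker where any task resource option fits."""
    for worker, free in free_by_worker.items():
        if any(fits(opt, free) for opt in task_options):
            return worker
    return None


if __name__ == '__main__':
    total = Capacity(cpus=8, memory=32, accelerators={'A100': 8})
    used = Capacity(cpus=2, memory=8, accelerators={'A100': 4})
    free = subtract(total, used)
    print(pick_worker([Capacity(accelerators={'A100': 4})], {'w-1': free}))  # w-1
    print(pick_worker([Capacity(accelerators={'A100': 8})], {'w-1': free}))  # None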
@@ -1541,7 +1719,21 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
         replica_status = record['status']
         status_str = replica_status.colored_str()
         used_by = record.get('used_by', None)
-        used_by_str = str(used_by) if used_by is not None else '-'
+        if used_by is None:
+            used_by_str = '-'
+        elif isinstance(used_by, str):
+            used_by_str = used_by
+        else:
+            if len(used_by) > 2:
+                used_by_str = (
+                    f'{used_by[0]}, {used_by[1]}, +{len(used_by) - 2}'
+                    ' more')
+            elif len(used_by) == 2:
+                used_by_str = f'{used_by[0]}, {used_by[1]}'
+            elif len(used_by) == 1:
+                used_by_str = str(used_by[0])
+            else:
+                used_by_str = '-'
 
         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
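Because `used_by` can now be a list of job IDs (a worker may run several jobs) rather than a single ID, the table formatter truncates long lists. A self-contained sketch of the same truncation rule, with a hypothetical helper name:

from typing import List, Union


def format_used_by(used_by: Union[None, str, List[int]]) -> str:
    """Render the USED_BY column: '-', a raw string, or 'a, b, +N more'."""
    if used_by is None:
        return '-'
    if isinstance(used_by, str):
        return used_by
    if len(used_by) > 2:
        return f'{used_by[0]}, {used_by[1]}, +{len(used_by) - 2} more'
    if len(used_by) == 2:
        return f'{used_by[0]}, {used_by[1]}'
    if len(used_by) == 1:
        return str(used_by[0])
    return '-'


assert format_used_by(None) == '-'
assert format_used_by([7]) == '7'
assert format_used_by([7, 8, 9, 10]) == '7, 8, +2 more'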
sky/server/common.py CHANGED
@@ -17,6 +17,7 @@ import time
 import typing
 from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                     Tuple, TypeVar, Union)
+from urllib.request import Request
 import uuid
 
 import cachetools
@@ -147,6 +148,22 @@ def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
     return cookie_jar
 
 
+def get_cookie_header_for_url(url: str) -> Dict[str, str]:
+    """Extract Cookie header value from a cookie jar for a specific URL"""
+    cookies = get_api_cookie_jar()
+    if not cookies:
+        return {}
+
+    # Use urllib Request to do URL-aware cookie filtering
+    request = Request(url)
+    cookies.add_cookie_header(request)
+    cookie_header = request.get_header('Cookie')
+
+    if cookie_header is None:
+        return {}
+    return {'Cookie': cookie_header}
+
+
 def set_api_cookie_jar(cookie_jar: CookieJar,
                        create_if_not_exists: bool = True) -> None:
     """Updates the file cookie jar with the given cookie jar."""
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 25
+API_VERSION = 26
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/metrics.py CHANGED
@@ -48,10 +48,12 @@ async def gpu_metrics() -> fastapi.Response:
     all_metrics: List[str] = []
     successful_contexts = 0
 
+    remote_contexts = [
+        context for context in contexts if context != 'in-cluster'
+    ]
     tasks = [
         asyncio.create_task(metrics_utils.get_metrics_for_context(context))
-        for context in contexts
-        if context != 'in-cluster'
+        for context in remote_contexts
     ]
 
     results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -59,7 +61,8 @@ async def gpu_metrics() -> fastapi.Response:
     for i, result in enumerate(results):
         if isinstance(result, Exception):
             logger.error(
-                f'Failed to get metrics for context {contexts[i]}: {result}')
+                f'Failed to get metrics for context {remote_contexts[i]}: '
+                f'{result}')
         elif isinstance(result, BaseException):
             # Avoid changing behavior for non-Exception BaseExceptions
             # like KeyboardInterrupt/SystemExit: re-raise them.
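This change fixes an index mismatch: tasks were created only for non-'in-cluster' contexts, yet errors were reported against the unfiltered `contexts` list, so a failure could be attributed to the wrong context. Filtering into `remote_contexts` first keeps the task list index-aligned with the `asyncio.gather` results. A generic sketch of that pattern (the `fetch` coroutine and context names are hypothetical):

import asyncio
from typing import List


async def fetch(name: str) -> str:
    # Hypothetical per-context work; one context fails to show error handling.
    if name == 'bad':
        raise RuntimeError('boom')
    await asyncio.sleep(0)
    return f'metrics:{name}'


async def gather_metrics(contexts: List[str]) -> List[str]:
    # Filter first, then build tasks, so results[i] corresponds to wanted[i].
    wanted = [c for c in contexts if c != 'in-cluster']
    tasks = [asyncio.create_task(fetch(c)) for c in wanted]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    ok: List[str] = []
    for ctx, result in zip(wanted, results):
        if isinstance(result, Exception):
            print(f'Failed to get metrics for context {ctx}: {result}')
        else:
            ok.append(result)
    return ok


print(asyncio.run(gather_metrics(['in-cluster', 'good', 'bad'])))
# Failed to get metrics for context bad: boom
# ['metrics:good']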
sky/server/plugins.py CHANGED
@@ -90,11 +90,26 @@ class RBACRule:
 class BasePlugin(abc.ABC):
     """Base class for all SkyPilot server plugins."""
 
+    @property
+    def name(self) -> Optional[str]:
+        """Plugin name for display purposes."""
+        return None
+
     @property
     def js_extension_path(self) -> Optional[str]:
         """Optional API route to the JavaScript extension to load."""
         return None
 
+    @property
+    def version(self) -> Optional[str]:
+        """Plugin version."""
+        return None
+
+    @property
+    def commit(self) -> Optional[str]:
+        """Plugin git commit hash."""
+        return None
+
     @abc.abstractmethod
     def install(self, extension_context: ExtensionContext):
         """Hook called by API server to let the plugin install itself."""
@@ -164,6 +179,7 @@ def load_plugins(extension_context: ExtensionContext):
 
     for plugin_config in config.get('plugins', []):
         class_path = plugin_config['class']
+        logger.debug(f'Loading plugins: {class_path}')
         module_path, class_name = class_path.rsplit('.', 1)
         try:
             module = importlib.import_module(module_path)
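The diff adds optional `name`, `version`, and `commit` metadata properties to `BasePlugin`, alongside the existing `js_extension_path`. A minimal sketch of a plugin overriding them, assuming only the interface shown in this hunk (that `BasePlugin` and `ExtensionContext` are importable from `sky.server.plugins`); `MyPlugin` and its values are illustrative:

# Illustrative only: a plugin exposing the new metadata properties.
from typing import Optional

from sky.server.plugins import BasePlugin, ExtensionContext


class MyPlugin(BasePlugin):
    """Example plugin that reports its identity to the API server."""

    @property
    def name(self) -> Optional[str]:
        return 'my-plugin'

    @property
    def version(self) -> Optional[str]:
        return '0.1.0'

    @property
    def commit(self) -> Optional[str]:
        return 'deadbeef'  # placeholder git commit hash

    def install(self, extension_context: ExtensionContext) -> None:
        # Register routes/handlers here; omitted in this sketch.
        pass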
sky/server/requests/payloads.py CHANGED
@@ -482,6 +482,7 @@ class VolumeApplyBody(RequestBody):
 class VolumeDeleteBody(RequestBody):
     """The request body for the volume delete endpoint."""
     names: List[str]
+    purge: bool = False
 
 
 class VolumeListBody(RequestBody):
@@ -865,3 +866,20 @@ class SlurmGpuAvailabilityRequestBody(RequestBody):
     """Request body for getting Slurm real-time GPU availability."""
     name_filter: Optional[str] = None
     quantity_filter: Optional[int] = None
+
+
+class ClusterEventsBody(RequestBody):
+    """The request body for the cluster events endpoint."""
+    cluster_name: Optional[str] = None
+    cluster_hash: Optional[str] = None
+    event_type: str  # 'STATUS_CHANGE' or 'DEBUG'
+    include_timestamps: bool = False
+    limit: Optional[
+        int] = None  # If specified, returns at most this many events
+
+
+class GetJobEventsBody(RequestBody):
+    """The request body for the get job task events endpoint."""
+    job_id: int
+    task_id: Optional[int] = None
+    limit: Optional[int] = 10  # Default to 10 most recent task events
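These request bodies are declarative models with defaults, so an omitted field simply takes its default when the server parses the payload. A rough, self-contained illustration using pydantic directly; the `GetJobEventsBody` below is a local re-declaration for the example, not SkyPilot's class (whose `RequestBody` base adds more fields):

from typing import Optional

import pydantic


class GetJobEventsBody(pydantic.BaseModel):
    """Local stand-in mirroring the fields added in this diff."""
    job_id: int
    task_id: Optional[int] = None
    limit: Optional[int] = 10  # default: 10 most recent task events


# Parsing a minimal JSON payload, roughly as the server would:
body = GetJobEventsBody(**{'job_id': 42})
print(body.limit)    # 10
print(body.task_id)  # None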
sky/server/requests/request_names.py CHANGED
@@ -31,6 +31,7 @@ class RequestName(str, enum.Enum):
     CLUSTER_JOB_LOGS = 'logs'
     CLUSTER_JOB_DOWNLOAD_LOGS = 'download_logs'
     CLUSTER_COST_REPORT = 'cost_report'
+    CLUSTER_EVENTS = 'cluster_events'
     # Storage requests
     STORAGE_LS = 'storage_ls'
     STORAGE_DELETE = 'storage_delete'
@@ -52,6 +53,7 @@ class RequestName(str, enum.Enum):
     JOBS_POOL_STATUS = 'jobs.pool_status'
     JOBS_POOL_LOGS = 'jobs.pool_logs'
     JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'
+    JOBS_EVENTS = 'jobs.events'
     # Serve requests
     SERVE_UP = 'serve.up'
     SERVE_UPDATE = 'serve.update'
sky/server/requests/requests.py CHANGED
@@ -873,11 +873,17 @@ async def create_if_not_exists_async(request: Request) -> bool:
        f'({request_columns}) VALUES '
        f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
     request_row = request.to_row()
-    # Execute the SQL statement without getting the request lock.
-    # The request lock is used to prevent racing with cancellation codepath,
-    # but a request cannot be cancelled before it is created.
-    row = await _DB.execute_get_returning_value_async(sql_statement,
-                                                      request_row)
+    if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+        logger.debug(f'Start creating request {request.request_id}')
+    try:
+        # Execute the SQL statement without getting the request lock.
+        # The request lock is used to prevent racing with cancellation codepath,
+        # but a request cannot be cancelled before it is created.
+        row = await _DB.execute_get_returning_value_async(
+            sql_statement, request_row)
+    finally:
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'End creating request {request.request_id}')
     return True if row else False
 
 
@@ -1034,9 +1040,15 @@ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
 def _add_or_update_request_no_lock(request: Request):
     """Add or update a REST request into the database."""
     assert _DB is not None
-    with _DB.conn:
-        cursor = _DB.conn.cursor()
-        cursor.execute(_add_or_update_request_sql, request.to_row())
+    if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+        logger.debug(f'Start adding or updating request {request.request_id}')
+    try:
+        with _DB.conn:
+            cursor = _DB.conn.cursor()
+            cursor.execute(_add_or_update_request_sql, request.to_row())
+    finally:
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'End adding or updating request {request.request_id}')
 
 
 async def _add_or_update_request_no_lock_async(request: Request):
@@ -1125,8 +1137,14 @@ async def _delete_requests(request_ids: List[str]):
     """Clean up requests by their IDs."""
     id_list_str = ','.join(repr(request_id) for request_id in request_ids)
     assert _DB is not None
-    await _DB.execute_and_commit_async(
-        f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
+    if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+        logger.debug(f'Start deleting requests {request_ids}')
+    try:
+        await _DB.execute_and_commit_async(
+            f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
+    finally:
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'End deleting requests {request_ids}')
 
 
 async def clean_finished_requests_with_retention(retention_seconds: int,
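These three changes wrap database operations in 'Start …'/'End …' debug lines, guarded by `sky_logging.logging_enabled` so the f-strings are only built when debug logging is on, and placed in try/finally so the 'End' line is emitted even on failure. The same bracketing can be written once as a context manager; the sketch below is a generic illustration using the standard logging module, not SkyPilot code:

import contextlib
import logging
from typing import Iterator

logger = logging.getLogger(__name__)


@contextlib.contextmanager
def debug_bracket(action: str) -> Iterator[None]:
    """Log 'Start <action>' / 'End <action>' around a block, even on error."""
    enabled = logger.isEnabledFor(logging.DEBUG)
    if enabled:
        logger.debug('Start %s', action)
    try:
        yield
    finally:
        if enabled:
            logger.debug('End %s', action)


# Usage:
logging.basicConfig(level=logging.DEBUG)
with debug_bracket('deleting requests [1, 2, 3]'):
    pass  # the guarded operation goes here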
sky/server/requests/serializers/encoders.py CHANGED
@@ -69,6 +69,11 @@ def encode_status(
             response_cluster['last_use'] = ''
         if 'status_updated_at' not in response_cluster:
             response_cluster['status_updated_at'] = 0
+        # Ensure labels is always included, defaulting to empty dict if None
+        # This is needed because exclude_none=True would exclude None labels
+        if 'labels' not in response_cluster or response_cluster.get(
+                'labels') is None:
+            response_cluster['labels'] = {}
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
sky/server/requests/serializers/return_value_serializers.py CHANGED
@@ -50,11 +50,21 @@ def serialize_kubernetes_node_info(return_value: Dict[str, Any]) -> str:
 
     The is_ready field was added in API version 25. Remove it for old clients
     that don't recognize it.
+    The cpu_count, memory_gb, cpu_free, and memory_free_gb fields were added
+    in API version 26. Remove them for old clients that don't recognize them.
     """
     remote_api_version = versions.get_remote_api_version()
-    if (return_value and remote_api_version is not None and
-            remote_api_version < 25):
-        # Remove is_ready field for old clients that don't recognize it
+    if (return_value and remote_api_version is not None):
         for node_info in return_value.get('node_info_dict', {}).values():
-            node_info.pop('is_ready', None)
+            if remote_api_version < 25:
+                # Remove is_ready field for old clients that don't recognize it
+                node_info.pop('is_ready', None)
+            if remote_api_version < 26:
+                # Remove cpu_count, memory_gb, cpu_free, and
+                # memory_free_gb fields for old clients that don't
+                # recognize them
+                node_info.pop('cpu_count', None)
+                node_info.pop('memory_gb', None)
+                node_info.pop('cpu_free', None)
+                node_info.pop('memory_free_gb', None)
     return orjson.dumps(return_value).decode('utf-8')
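This serializer strips fields from the response based on the peer's API version, matching the API_VERSION bump to 26 earlier in this diff. A generic sketch of that compatibility-gating pattern; the field-to-version map below reuses values visible in this diff but the function itself is illustrative, not SkyPilot's serializer:

from typing import Any, Dict, Optional

# Fields mapped to the API version that introduced them (is_ready in 25,
# the CPU/memory fields in 26, per this diff).
_FIELD_MIN_VERSION = {
    'is_ready': 25,
    'cpu_count': 26,
    'memory_gb': 26,
    'cpu_free': 26,
    'memory_free_gb': 26,
}


def strip_unknown_fields(node_info: Dict[str, Any],
                         remote_api_version: Optional[int]) -> Dict[str, Any]:
    """Drop fields the remote peer is too old to understand."""
    if remote_api_version is None:
        return node_info  # unknown peer version: send everything unchanged
    return {
        key: value for key, value in node_info.items()
        if _FIELD_MIN_VERSION.get(key, 0) <= remote_api_version
    }


old_client = strip_unknown_fields(
    {'name': 'node-1', 'is_ready': True, 'cpu_free': 3.5},
    remote_api_version=24)
print(old_client)  # {'name': 'node-1'}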