skypilot-nightly 1.0.0.dev20250627__py3-none-any.whl → 1.0.0.dev20250630__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +14 -0
  3. sky/adaptors/nebius.py +2 -2
  4. sky/authentication.py +12 -5
  5. sky/backends/backend_utils.py +92 -26
  6. sky/check.py +5 -2
  7. sky/client/cli/command.py +39 -8
  8. sky/client/sdk.py +217 -167
  9. sky/client/service_account_auth.py +47 -0
  10. sky/clouds/aws.py +10 -4
  11. sky/clouds/azure.py +5 -2
  12. sky/clouds/cloud.py +5 -2
  13. sky/clouds/gcp.py +31 -18
  14. sky/clouds/kubernetes.py +54 -34
  15. sky/clouds/nebius.py +8 -2
  16. sky/clouds/ssh.py +5 -2
  17. sky/clouds/utils/aws_utils.py +10 -4
  18. sky/clouds/utils/gcp_utils.py +22 -7
  19. sky/clouds/utils/oci_utils.py +62 -14
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/NdypbqMxaYucRGfopkKXa/_buildManifest.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/1043-1b39779691bb4030.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/{141-fa5a20cbf401b351.js → 1141-726e5a3f00b67185.js} +2 -2
  24. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/1691.44e378727a41f3b5.js +21 -0
  27. sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +6 -0
  28. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/{875.52c962183328b3f2.js → 2875.c24c6d57dc82e436.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/3698-52ad1ca228faa776.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/3785.b3cc2bc1d49d2c3c.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/{947-6620842ef80ae879.js → 3947-b059261d6fa88a1f.js} +1 -1
  35. sky/dashboard/out/_next/static/chunks/{697.6460bf72e760addd.js → 4697.f5421144224da9fc.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/4725.4c849b1e05c8e9ad.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{491.b3d264269613fe09.js → 5491.918ffed0ba7a5294.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +8 -0
  40. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +39 -0
  41. sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/6989-6ff4e45dfb49d11d.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/{25.76c246239df93d50.js → 9025.a7c44babfe56ce09.js} +2 -2
  47. sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/9470-21d059a1dfa03f61.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{framework-87d061ee6ed71b28.js → framework-efc06c2733009cd3.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/{main-e0e2335212e72357.js → main-c0a4f1ea606d48d2.js} +1 -1
  54. sky/dashboard/out/_next/static/chunks/pages/{_app-9a3ce3170d2edcec.js → _app-a37b06ddb64521fd.js} +2 -2
  55. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +6 -0
  57. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters-9744c271a1642f76.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +16 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/users-cd43fb3c122eedde.js +1 -0
  66. sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/workspaces-06bde99155fa6292.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/webpack-d427db53e54de9ce.js +1 -0
  71. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +3 -0
  72. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  73. sky/dashboard/out/clusters/[cluster].html +1 -1
  74. sky/dashboard/out/clusters.html +1 -1
  75. sky/dashboard/out/config.html +1 -1
  76. sky/dashboard/out/index.html +1 -1
  77. sky/dashboard/out/infra/[context].html +1 -1
  78. sky/dashboard/out/infra.html +1 -1
  79. sky/dashboard/out/jobs/[job].html +1 -1
  80. sky/dashboard/out/jobs.html +1 -1
  81. sky/dashboard/out/users.html +1 -1
  82. sky/dashboard/out/volumes.html +1 -1
  83. sky/dashboard/out/workspace/new.html +1 -1
  84. sky/dashboard/out/workspaces/[name].html +1 -1
  85. sky/dashboard/out/workspaces.html +1 -1
  86. sky/data/storage.py +8 -3
  87. sky/global_user_state.py +257 -9
  88. sky/jobs/client/sdk.py +20 -25
  89. sky/models.py +16 -0
  90. sky/optimizer.py +46 -0
  91. sky/provision/__init__.py +14 -6
  92. sky/provision/kubernetes/config.py +1 -1
  93. sky/provision/kubernetes/constants.py +9 -0
  94. sky/provision/kubernetes/instance.py +24 -18
  95. sky/provision/kubernetes/network.py +15 -9
  96. sky/provision/kubernetes/network_utils.py +42 -23
  97. sky/provision/kubernetes/utils.py +73 -35
  98. sky/provision/kubernetes/volume.py +77 -15
  99. sky/provision/nebius/utils.py +10 -4
  100. sky/resources.py +10 -4
  101. sky/serve/client/sdk.py +28 -34
  102. sky/server/common.py +51 -3
  103. sky/server/constants.py +3 -0
  104. sky/server/requests/executor.py +4 -0
  105. sky/server/requests/payloads.py +33 -0
  106. sky/server/requests/requests.py +19 -0
  107. sky/server/rest.py +6 -15
  108. sky/server/server.py +121 -6
  109. sky/skylet/constants.py +7 -0
  110. sky/skypilot_config.py +32 -4
  111. sky/task.py +12 -0
  112. sky/users/permission.py +29 -0
  113. sky/users/server.py +384 -5
  114. sky/users/token_service.py +196 -0
  115. sky/utils/common_utils.py +4 -5
  116. sky/utils/config_utils.py +41 -0
  117. sky/utils/controller_utils.py +5 -1
  118. sky/utils/log_utils.py +68 -0
  119. sky/utils/resource_checker.py +153 -0
  120. sky/utils/resources_utils.py +12 -4
  121. sky/utils/schemas.py +87 -60
  122. sky/utils/subprocess_utils.py +2 -6
  123. sky/volumes/server/core.py +103 -78
  124. sky/volumes/utils.py +22 -5
  125. sky/workspaces/core.py +9 -117
  126. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/METADATA +1 -1
  127. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/RECORD +133 -128
  128. sky/dashboard/out/_next/static/HudU4f4Xsy-cP51JvXSZ-/_buildManifest.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +0 -39
  134. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +0 -16
  137. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +0 -8
  140. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +0 -6
  141. sky/dashboard/out/_next/static/chunks/937.3759f538f11a0953.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +0 -1
  146. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +0 -1
  147. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +0 -6
  153. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +0 -6
  154. sky/dashboard/out/_next/static/chunks/pages/clusters-f119a5630a1efd61.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +0 -1
  158. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +0 -16
  160. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5b59bce9eb208d84.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +0 -1
  166. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +0 -1
  167. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +0 -3
  168. /sky/dashboard/out/_next/static/{HudU4f4Xsy-cP51JvXSZ- → NdypbqMxaYucRGfopkKXa}/_ssgManifest.js +0 -0
  169. /sky/dashboard/out/_next/static/chunks/{804-4c9fc53aa74bc191.js → 804-9f5e98ce84d46bdd.js} +0 -0
  170. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/licenses/LICENSE +0 -0
  173. {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
1
1
  """Kubernetes pvc provisioning."""
2
2
  from typing import Any, Dict, List, Optional, Tuple
3
3
 
4
+ from sky import global_user_state
4
5
  from sky import models
5
6
  from sky import sky_logging
6
7
  from sky.adaptors import kubernetes
7
8
  from sky.provision.kubernetes import config as config_lib
9
+ from sky.provision.kubernetes import constants as k8s_constants
8
10
  from sky.provision.kubernetes import utils as kubernetes_utils
9
11
  from sky.volumes import volume as volume_lib
10
12
 
@@ -45,17 +47,26 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
45
47
  access_mode = pvc.spec.access_modes[0]
46
48
  if access_mode not in once_modes:
47
49
  continue
48
- usedby = _get_volume_usedby(context, namespace, pvc_name)
49
- if usedby:
50
+ usedby_pods, _ = _get_volume_usedby(context, namespace, pvc_name)
51
+ if usedby_pods:
50
52
  raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
51
53
  f'mode {access_mode} is already '
52
- f'in use by {usedby}.')
54
+ f'in use by Pods {usedby_pods}.')
53
55
 
54
56
 
55
57
  def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
56
58
  """Creates or registers a volume."""
57
59
  context, namespace = _get_context_namespace(config)
58
60
  pvc_spec = _get_pvc_spec(namespace, config)
61
+ # Check if the storage class exists
62
+ storage_class_name = pvc_spec['spec'].get('storageClassName')
63
+ if storage_class_name is not None:
64
+ try:
65
+ kubernetes.storage_api(context).read_storage_class(
66
+ name=storage_class_name)
67
+ except kubernetes.api_exception() as e:
68
+ raise config_lib.KubernetesError(
69
+ f'Check storage class {storage_class_name} error: {e}')
59
70
  create_persistent_volume_claim(namespace, context, pvc_spec)
60
71
  return config
61
72
 
@@ -76,22 +87,73 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
76
87
  return config
77
88
 
78
89
 
79
- def _get_volume_usedby(context: Optional[str], namespace: str,
80
- pvc_name: str) -> List[str]:
81
- """Gets the usedby resources of a volume."""
82
- usedby = []
90
+ def _get_volume_usedby(
91
+ context: Optional[str],
92
+ namespace: str,
93
+ pvc_name: str,
94
+ ) -> Tuple[List[str], List[str]]:
95
+ """Gets the usedby resources of a volume.
96
+
97
+ This function returns the pods and clusters that are using the volume.
98
+ The usedby_pods is accurate, which also includes the Pods that are not
99
+ managed by SkyPilot.
100
+
101
+ Args:
102
+ context: Kubernetes context
103
+ namespace: Kubernetes namespace
104
+ pvc_name: PVC name
105
+
106
+ Returns:
107
+ usedby_pods: List of pods using the volume. These may include pods
108
+ not created by SkyPilot.
109
+ usedby_clusters: List of clusters using the volume.
110
+ """
111
+ usedby_pods = []
112
+ usedby_clusters = []
113
+ field_selector = ','.join([
114
+ f'status.phase!={phase}'
115
+ for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
116
+ ])
117
+ cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
83
118
  # Get all pods in the namespace
84
- pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
119
+ pods = kubernetes.core_api(context).list_namespaced_pod(
120
+ namespace=namespace, field_selector=field_selector)
85
121
  for pod in pods.items:
86
- if pod.spec.volumes is not None:
87
- for volume in pod.spec.volumes:
88
- if volume.persistent_volume_claim is not None:
89
- if volume.persistent_volume_claim.claim_name == pvc_name:
90
- usedby.append(pod.metadata.name)
91
- return usedby
122
+ if pod.spec.volumes is None:
123
+ continue
124
+ for volume in pod.spec.volumes:
125
+ if volume.persistent_volume_claim is None:
126
+ continue
127
+ if volume.persistent_volume_claim.claim_name == pvc_name:
128
+ usedby_pods.append(pod.metadata.name)
129
+ # Get the real cluster name
130
+ cluster_name_on_cloud = pod.metadata.labels.get(
131
+ k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
132
+ if cluster_name_on_cloud is None:
133
+ continue
134
+ cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
135
+ if cluster_name is not None:
136
+ usedby_clusters.append(cluster_name)
137
+ if usedby_pods:
138
+ logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
139
+ f' and clusters {usedby_clusters}')
140
+ return usedby_pods, usedby_clusters
141
+
142
+
143
+ def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
144
+ """Gets the map from cluster name on cloud to cluster name."""
145
+ clusters = global_user_state.get_clusters()
146
+ cloud_to_name_map = {}
147
+ for cluster in clusters:
148
+ handle = cluster['handle']
149
+ if handle is None:
150
+ continue
151
+ cloud_to_name_map[handle.cluster_name_on_cloud] = cluster['name']
152
+ return cloud_to_name_map
92
153
 
93
154
 
94
- def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
155
+ def get_volume_usedby(
156
+ config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
95
157
  """Gets the usedby resources of a volume."""
96
158
  context, namespace = _get_context_namespace(config)
97
159
  pvc_name = config.name_on_cloud
@@ -40,8 +40,11 @@ def get_project_by_region(region: str) -> str:
40
40
  parent_id=nebius.get_tenant_id())).wait()
41
41
 
42
42
  # Check is there project if in config
43
- project_id = skypilot_config.get_nested(('nebius', region, 'project_id'),
44
- None)
43
+ project_id = skypilot_config.get_effective_region_config(
44
+ cloud='nebius',
45
+ region=None,
46
+ keys=(region, 'project_id'),
47
+ default_value=None)
45
48
  if project_id is not None:
46
49
  return project_id
47
50
  for project in projects.items:
@@ -184,8 +187,11 @@ def launch(cluster_name_on_cloud: str,
184
187
  # https://docs.nebius.com/compute/clusters/gpu
185
188
  if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
186
189
  if preset == '8gpu-128vcpu-1600gb':
187
- fabric = skypilot_config.get_nested(('nebius', region, 'fabric'),
188
- None)
190
+ fabric = skypilot_config.get_effective_region_config(
191
+ cloud='nebius',
192
+ region=None,
193
+ keys=(region, 'fabric'),
194
+ default_value=None)
189
195
 
190
196
  # Auto-select fabric if network_tier=best and no fabric configured
191
197
  if (fabric is None and
sky/resources.py CHANGED
@@ -1064,8 +1064,11 @@ class Resources:
1064
1064
  regions = [r for r in regions if r.name in self._image_id]
1065
1065
 
1066
1066
  # Filter the regions by the skypilot_config
1067
- ssh_proxy_command_config = skypilot_config.get_nested(
1068
- (str(self._cloud).lower(), 'ssh_proxy_command'), None)
1067
+ ssh_proxy_command_config = skypilot_config.get_effective_region_config(
1068
+ cloud=str(self._cloud).lower(),
1069
+ region=None,
1070
+ keys=('ssh_proxy_command',),
1071
+ default_value=None)
1069
1072
  if (isinstance(ssh_proxy_command_config, str) or
1070
1073
  ssh_proxy_command_config is None):
1071
1074
  # All regions are valid as the regions are not specified for the
@@ -1550,8 +1553,11 @@ class Resources:
1550
1553
  # to each cloud if any cloud supports reservations for spot.
1551
1554
  return {}
1552
1555
  specific_reservations = set(
1553
- skypilot_config.get_nested(
1554
- (str(self.cloud).lower(), 'specific_reservations'), set()))
1556
+ skypilot_config.get_effective_region_config(
1557
+ cloud=str(self.cloud).lower(),
1558
+ region=self.region,
1559
+ keys=('specific_reservations',),
1560
+ default_value=set()))
1555
1561
 
1556
1562
  if isinstance(self.cloud, clouds.DummyCloud):
1557
1563
  return self.cloud.get_reservations_available_resources(
sky/serve/client/sdk.py CHANGED
@@ -74,12 +74,11 @@ def up(
74
74
  task=dag_str,
75
75
  service_name=service_name,
76
76
  )
77
- response = rest.post(
78
- f'{server_common.get_server_url()}/serve/up',
77
+ response = server_common.make_authenticated_request(
78
+ 'POST',
79
+ '/serve/up',
79
80
  json=json.loads(body.model_dump_json()),
80
- timeout=(5, None),
81
- cookies=server_common.get_api_cookie_jar(),
82
- )
81
+ timeout=(5, None))
83
82
  return server_common.get_request_id(response)
84
83
 
85
84
 
@@ -136,12 +135,11 @@ def update(
136
135
  mode=mode,
137
136
  )
138
137
 
139
- response = rest.post(
140
- f'{server_common.get_server_url()}/serve/update',
138
+ response = server_common.make_authenticated_request(
139
+ 'POST',
140
+ '/serve/update',
141
141
  json=json.loads(body.model_dump_json()),
142
- timeout=(5, None),
143
- cookies=server_common.get_api_cookie_jar(),
144
- )
142
+ timeout=(5, None))
145
143
  return server_common.get_request_id(response)
146
144
 
147
145
 
@@ -178,12 +176,11 @@ def down(
178
176
  all=all,
179
177
  purge=purge,
180
178
  )
181
- response = rest.post(
182
- f'{server_common.get_server_url()}/serve/down',
179
+ response = server_common.make_authenticated_request(
180
+ 'POST',
181
+ '/serve/down',
183
182
  json=json.loads(body.model_dump_json()),
184
- timeout=(5, None),
185
- cookies=server_common.get_api_cookie_jar(),
186
- )
183
+ timeout=(5, None))
187
184
  return server_common.get_request_id(response)
188
185
 
189
186
 
@@ -213,12 +210,11 @@ def terminate_replica(service_name: str, replica_id: int,
213
210
  replica_id=replica_id,
214
211
  purge=purge,
215
212
  )
216
- response = rest.post(
217
- f'{server_common.get_server_url()}/serve/terminate-replica',
213
+ response = server_common.make_authenticated_request(
214
+ 'POST',
215
+ '/serve/terminate-replica',
218
216
  json=json.loads(body.model_dump_json()),
219
- timeout=(5, None),
220
- cookies=server_common.get_api_cookie_jar(),
221
- )
217
+ timeout=(5, None))
222
218
  return server_common.get_request_id(response)
223
219
 
224
220
 
@@ -286,12 +282,11 @@ def status(
286
282
  exceptions.ClusterNotUpError: if the sky serve controller is not up.
287
283
  """
288
284
  body = payloads.ServeStatusBody(service_names=service_names,)
289
- response = rest.post(
290
- f'{server_common.get_server_url()}/serve/status',
285
+ response = server_common.make_authenticated_request(
286
+ 'POST',
287
+ '/serve/status',
291
288
  json=json.loads(body.model_dump_json()),
292
- timeout=(5, None),
293
- cookies=server_common.get_api_cookie_jar(),
294
- )
289
+ timeout=(5, None))
295
290
  return server_common.get_request_id(response)
296
291
 
297
292
 
@@ -373,13 +368,12 @@ def tail_logs(service_name: str,
373
368
  replica_id=replica_id,
374
369
  follow=follow,
375
370
  )
376
- response = rest.post(
377
- f'{server_common.get_server_url()}/serve/logs',
371
+ response = server_common.make_authenticated_request(
372
+ 'POST',
373
+ '/serve/logs',
378
374
  json=json.loads(body.model_dump_json()),
379
375
  timeout=(5, None),
380
- stream=True,
381
- cookies=server_common.get_api_cookie_jar(),
382
- )
376
+ stream=True)
383
377
  request_id = server_common.get_request_id(response)
384
378
  return sdk.stream_response(request_id=request_id,
385
379
  response=response,
@@ -436,11 +430,11 @@ def sync_down_logs(service_name: str,
436
430
  targets=targets,
437
431
  replica_ids=replica_ids,
438
432
  )
439
- response = rest.post(
440
- f'{server_common.get_server_url()}/serve/sync-down-logs',
433
+ response = server_common.make_authenticated_request(
434
+ 'POST',
435
+ '/serve/sync-down-logs',
441
436
  json=json.loads(body.model_dump_json()),
442
- timeout=(5, None),
443
- )
437
+ timeout=(5, None))
444
438
  remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
445
439
 
446
440
  # Download from API server paths to the client's local_dir
sky/server/common.py CHANGED
@@ -27,6 +27,7 @@ from sky import exceptions
27
27
  from sky import sky_logging
28
28
  from sky import skypilot_config
29
29
  from sky.adaptors import common as adaptors_common
30
+ from sky.client import service_account_auth
30
31
  from sky.data import data_utils
31
32
  from sky.server import constants as server_constants
32
33
  from sky.server import rest
@@ -185,6 +186,53 @@ def get_cookies_from_response(
185
186
  return cookies
186
187
 
187
188
 
189
+ def make_authenticated_request(method: str,
190
+ path: str,
191
+ server_url: Optional[str] = None,
192
+ retry: bool = True,
193
+ **kwargs) -> 'requests.Response':
194
+ """Make an authenticated HTTP request to the API server.
195
+
196
+ Automatically handles service account token authentication or cookie-based
197
+ authentication based on what's available.
198
+
199
+ Args:
200
+ method: HTTP method (GET, POST, etc.)
201
+ path: API path (e.g., '/api/v1/status')
202
+ server_url: Server URL, defaults to configured server
203
+ **kwargs: Additional arguments to pass to requests
204
+
205
+ Returns:
206
+ requests.Response object
207
+ """
208
+ if server_url is None:
209
+ server_url = get_server_url()
210
+
211
+ # Prepare headers and URL for service account authentication
212
+ headers = service_account_auth.get_service_account_headers()
213
+
214
+ # Merge with existing headers
215
+ if 'headers' in kwargs:
216
+ headers.update(kwargs['headers'])
217
+ kwargs['headers'] = headers
218
+
219
+ # Always use the same URL regardless of authentication type
220
+ # OAuth2 proxy will handle authentication based on headers
221
+ url = f'{server_url}/{path}' if not path.startswith(
222
+ '/') else f'{server_url}{path}'
223
+
224
+ # Use cookie authentication if no Bearer token present
225
+ if not headers.get('Authorization') and 'cookies' not in kwargs:
226
+ kwargs['cookies'] = get_api_cookie_jar()
227
+
228
+ # Make the request
229
+ if retry:
230
+ return rest.request(method, url, **kwargs)
231
+ else:
232
+ assert method == 'GET', 'Only GET requests can be done without retry'
233
+ return rest.request_without_retry(method, url, **kwargs)
234
+
235
+
188
236
  @annotations.lru_cache(scope='global')
189
237
  def get_server_url(host: Optional[str] = None) -> str:
190
238
  endpoint = DEFAULT_SERVER_URL
@@ -243,9 +291,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
243
291
  server_url = endpoint if endpoint is not None else get_server_url()
244
292
  while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
245
293
  try:
246
- response = rest.get(f'{server_url}/api/health',
247
- timeout=2.5,
248
- cookies=get_api_cookie_jar())
294
+ response = make_authenticated_request('GET',
295
+ '/api/health',
296
+ timeout=2.5)
249
297
  except requests.exceptions.Timeout:
250
298
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
251
299
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
sky/server/constants.py CHANGED
@@ -36,3 +36,6 @@ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
36
36
  # The path to the dashboard build output
37
37
  DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
38
38
  'out')
39
+
40
+ # The interval (seconds) for the event to be restarted in the background.
41
+ DAEMON_RESTART_INTERVAL_SECONDS = 20
@@ -268,6 +268,10 @@ def override_request_env_and_config(
268
268
  user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
269
269
  name=request_body.env_vars[constants.USER_ENV_VAR])
270
270
  global_user_state.add_or_update_user(user)
271
+ # Refetch the user to get the latest user info, including the created_at
272
+ # field.
273
+ user = global_user_state.get_user(user.id)
274
+
271
275
  # Force color to be enabled.
272
276
  os.environ['CLICOLOR_FORCE'] = '1'
273
277
  server_common.reload_for_new_request(
@@ -358,6 +358,39 @@ class UserImportBody(RequestBody):
358
358
  csv_content: str
359
359
 
360
360
 
361
+ class ServiceAccountTokenCreateBody(RequestBody):
362
+ """The request body for creating a service account token."""
363
+ token_name: str
364
+ expires_in_days: Optional[int] = None
365
+
366
+
367
+ class ServiceAccountTokenDeleteBody(RequestBody):
368
+ """The request body for deleting a service account token."""
369
+ token_id: str
370
+
371
+
372
+ class UpdateRoleBody(RequestBody):
373
+ """The request body for updating a user role."""
374
+ role: str
375
+
376
+
377
+ class ServiceAccountTokenRoleBody(RequestBody):
378
+ """The request body for getting a service account token role."""
379
+ token_id: str
380
+
381
+
382
+ class ServiceAccountTokenUpdateRoleBody(RequestBody):
383
+ """The request body for updating a service account token role."""
384
+ token_id: str
385
+ role: str
386
+
387
+
388
+ class ServiceAccountTokenRotateBody(RequestBody):
389
+ """The request body for rotating a service account token."""
390
+ token_id: str
391
+ expires_in_days: Optional[int] = None
392
+
393
+
361
394
  class DownloadBody(RequestBody):
362
395
  """The request body for the download endpoint."""
363
396
  folder_paths: List[str]
@@ -375,10 +375,29 @@ def managed_job_status_refresh_event():
375
375
 
376
376
  @dataclasses.dataclass
377
377
  class InternalRequestDaemon:
378
+ """Internal daemon that runs an event in the background."""
379
+
378
380
  id: str
379
381
  name: str
380
382
  event_fn: Callable[[], None]
381
383
 
384
+ def run_event(self):
385
+ """Run the event."""
386
+ while True:
387
+ with ux_utils.enable_traceback():
388
+ try:
389
+ self.event_fn()
390
+ break
391
+ except Exception: # pylint: disable=broad-except
392
+ # It is OK to fail to run the event, as the event is not
393
+ # critical, but we should log the error.
394
+ logger.exception(
395
+ f'Error running {self.name} event. '
396
+ f'Restarting in '
397
+ f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
398
+ 'seconds...')
399
+ time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
400
+
382
401
 
383
402
  # Register the events to run in the background.
384
403
  INTERNAL_REQUEST_DAEMONS = [
sky/server/rest.py CHANGED
@@ -129,25 +129,16 @@ def handle_server_unavailable(response: 'requests.Response') -> None:
129
129
 
130
130
 
131
131
  @retry_on_server_unavailable()
132
- def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
133
- """Send a POST request to the API server, retry on server temporarily
132
+ def request(method, url, **kwargs) -> 'requests.Response':
133
+ """Send a request to the API server, retry on server temporarily
134
134
  unavailable."""
135
- response = requests.post(url, data=data, json=json, **kwargs)
135
+ response = requests.request(method, url, **kwargs)
136
136
  handle_server_unavailable(response)
137
137
  return response
138
138
 
139
139
 
140
- @retry_on_server_unavailable()
141
- def get(url, params=None, **kwargs) -> 'requests.Response':
142
- """Send a GET request to the API server, retry on server temporarily
143
- unavailable."""
144
- response = requests.get(url, params=params, **kwargs)
145
- handle_server_unavailable(response)
146
- return response
147
-
148
-
149
- def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
150
- """Send a GET request to the API server without retry."""
151
- response = requests.get(url, params=params, **kwargs)
140
+ def request_without_retry(method, url, **kwargs) -> 'requests.Response':
141
+ """Send a request to the API server without retry."""
142
+ response = requests.request(method, url, **kwargs)
152
143
  handle_server_unavailable(response)
153
144
  return response
sky/server/server.py CHANGED
@@ -119,8 +119,11 @@ def _basic_auth_401_response(content: str):
119
119
  # TODO(hailong): Remove this function and use request.state.auth_user instead.
120
120
  async def _override_user_info_in_request_body(request: fastapi.Request,
121
121
  auth_user: Optional[models.User]):
122
+ if auth_user is None:
123
+ return
124
+
122
125
  body = await request.body()
123
- if auth_user and body:
126
+ if body:
124
127
  try:
125
128
  original_json = await request.json()
126
129
  except json.JSONDecodeError as e:
@@ -228,14 +231,17 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
228
231
 
229
232
  async def dispatch(self, request: fastapi.Request, call_next):
230
233
  if request.url.path.startswith('/api/'):
231
- # Try to set the auth user from the basic auth header so the
232
- # following endpoint handlers can leverage the auth_user info
234
+ # Try to set the auth user from basic auth
233
235
  _try_set_basic_auth_user(request)
234
236
  return await call_next(request)
235
237
 
236
238
  auth_header = request.headers.get('authorization')
237
- if not auth_header or not auth_header.lower().startswith('basic '):
238
- return _basic_auth_401_response('Invalid basic auth')
239
+ if not auth_header:
240
+ return _basic_auth_401_response('Authentication required')
241
+
242
+ # Only handle basic auth
243
+ if not auth_header.lower().startswith('basic '):
244
+ return _basic_auth_401_response('Invalid authentication method')
239
245
 
240
246
  # Check username and password
241
247
  encoded = auth_header.split(' ', 1)[1]
@@ -267,6 +273,111 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
267
273
  return await call_next(request)
268
274
 
269
275
 
276
+ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
277
+ """Middleware to handle Bearer Token Auth (Service Accounts)."""
278
+
279
+ async def dispatch(self, request: fastapi.Request, call_next):
280
+ # Only process requests with Bearer token authorization header
281
+ auth_header = request.headers.get('authorization')
282
+ if not auth_header or not auth_header.lower().startswith('bearer '):
283
+ # No Bearer token, continue with normal processing (OAuth2 cookies,
284
+ # etc.)
285
+ return await call_next(request)
286
+
287
+ # Extract token
288
+ sa_token = auth_header.split(' ', 1)[1]
289
+
290
+ # Handle SkyPilot service account tokens
291
+ if sa_token.startswith('sky_'):
292
+ return await self._handle_service_account_token(
293
+ request, sa_token, call_next)
294
+
295
+ # Handle other Bearer tokens (OAuth2 access tokens, etc.)
296
+ # These requests bypassed OAuth2 proxy, so let the application decide
297
+ # how to handle them
298
+ # For now, we'll let them continue through normal processing
299
+ logger.debug(
300
+ 'Non-SkyPilot Bearer token detected, continuing with normal '
301
+ 'processing')
302
+ return await call_next(request)
303
+
304
+ async def _handle_service_account_token(self, request: fastapi.Request,
305
+ sa_token: str, call_next):
306
+ """Handle SkyPilot service account tokens."""
307
+ # Check if service account tokens are enabled
308
+ sa_enabled = os.environ.get(constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
309
+ 'false').lower()
310
+ if sa_enabled != 'true':
311
+ return fastapi.responses.JSONResponse(
312
+ status_code=401,
313
+ content={'detail': 'Service account authentication disabled'})
314
+
315
+ try:
316
+ # Import here to avoid circular imports
317
+ # pylint: disable=import-outside-toplevel
318
+ from sky.users.token_service import token_service
319
+
320
+ # Verify and decode JWT token
321
+ payload = token_service.verify_token(sa_token)
322
+
323
+ if payload is None:
324
+ logger.warning('Service account token verification failed')
325
+ return fastapi.responses.JSONResponse(
326
+ status_code=401,
327
+ content={
328
+ 'detail': 'Invalid or expired service account token'
329
+ })
330
+
331
+ # Extract user information from JWT payload
332
+ user_id = payload.get('sub')
333
+ user_name = payload.get('name')
334
+ token_id = payload.get('token_id')
335
+
336
+ if not user_id or not token_id:
337
+ logger.warning(
338
+ 'Invalid token payload: missing user_id or token_id')
339
+ return fastapi.responses.JSONResponse(
340
+ status_code=401,
341
+ content={'detail': 'Invalid token payload'})
342
+
343
+ # Verify user still exists in database
344
+ user_info = global_user_state.get_user(user_id)
345
+ if user_info is None:
346
+ logger.warning(
347
+ f'Service account user {user_id} no longer exists')
348
+ return fastapi.responses.JSONResponse(
349
+ status_code=401,
350
+ content={'detail': 'Service account user no longer exists'})
351
+
352
+ # Update last used timestamp for token tracking
353
+ try:
354
+ global_user_state.update_service_account_token_last_used(
355
+ token_id)
356
+ except Exception as e: # pylint: disable=broad-except
357
+ logger.debug(f'Failed to update token last used time: {e}')
358
+
359
+ # Set the authenticated user
360
+ auth_user = models.User(id=user_id,
361
+ name=user_name or user_info.name)
362
+ request.state.auth_user = auth_user
363
+
364
+ # Override user info in request body for service account requests
365
+ await _override_user_info_in_request_body(request, auth_user)
366
+
367
+ logger.debug(f'Authenticated service account: {user_id}')
368
+
369
+ except Exception as e: # pylint: disable=broad-except
370
+ logger.error(f'Service account authentication failed: {e}',
371
+ exc_info=True)
372
+ return fastapi.responses.JSONResponse(
373
+ status_code=401,
374
+ content={
375
+ 'detail': f'Service account authentication failed: {str(e)}'
376
+ })
377
+
378
+ return await call_next(request)
379
+
380
+
270
381
  class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
271
382
  """Middleware to handle auth proxy."""
272
383
 
@@ -330,7 +441,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
330
441
  request_id=event.id,
331
442
  request_name=event.name,
332
443
  request_body=payloads.RequestBody(),
333
- func=event.event_fn,
444
+ func=event.run_event,
334
445
  schedule_type=requests_lib.ScheduleType.SHORT,
335
446
  is_skypilot_system=True,
336
447
  )
@@ -424,6 +535,9 @@ app.add_middleware(
424
535
  enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
425
536
  if str(enable_basic_auth).lower() == 'true':
426
537
  app.add_middleware(BasicAuthMiddleware)
538
+ # Bearer token middleware should always be present to handle service account
539
+ # authentication
540
+ app.add_middleware(BearerTokenMiddleware)
427
541
  app.add_middleware(AuthProxyMiddleware)
428
542
  app.add_middleware(RequestIDMiddleware)
429
543
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
@@ -1339,6 +1453,7 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
1339
1453
  - commit: str; The commit hash of SkyPilot used for API server.
1340
1454
  """
1341
1455
  user = request.state.auth_user
1456
+ logger.info(f'Health endpoint: request.state.auth_user = {user}')
1342
1457
  return {
1343
1458
  'status': common.ApiServerStatus.HEALTHY.value,
1344
1459
  'api_version': server_constants.API_VERSION,