skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/serve/server/server.py ADDED
@@ -0,0 +1,112 @@
1
+ """Rest APIs for SkyServe."""
2
+
3
+ import fastapi
4
+
5
+ from sky import sky_logging
6
+ from sky.serve.server import core
7
+ from sky.server import stream_utils
8
+ from sky.server.requests import executor
9
+ from sky.server.requests import payloads
10
+ from sky.server.requests import requests as api_requests
11
+ from sky.utils import common
12
+
13
+ logger = sky_logging.init_logger(__name__)
14
+ router = fastapi.APIRouter()
15
+
16
+
17
+ @router.post('/up')
18
+ async def up(
19
+ request: fastapi.Request,
20
+ up_body: payloads.ServeUpBody,
21
+ ) -> None:
22
+ executor.schedule_request(
23
+ request_id=request.state.request_id,
24
+ request_name='serve.up',
25
+ request_body=up_body,
26
+ func=core.up,
27
+ schedule_type=api_requests.ScheduleType.LONG,
28
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
29
+ )
30
+
31
+
32
+ @router.post('/update')
33
+ async def update(
34
+ request: fastapi.Request,
35
+ update_body: payloads.ServeUpdateBody,
36
+ ) -> None:
37
+ executor.schedule_request(
38
+ request_id=request.state.request_id,
39
+ request_name='serve.update',
40
+ request_body=update_body,
41
+ func=core.update,
42
+ schedule_type=api_requests.ScheduleType.SHORT,
43
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
44
+ )
45
+
46
+
47
+ @router.post('/down')
48
+ async def down(
49
+ request: fastapi.Request,
50
+ down_body: payloads.ServeDownBody,
51
+ ) -> None:
52
+ executor.schedule_request(
53
+ request_id=request.state.request_id,
54
+ request_name='serve.down',
55
+ request_body=down_body,
56
+ func=core.down,
57
+ schedule_type=api_requests.ScheduleType.SHORT,
58
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
59
+ )
60
+
61
+
62
+ @router.post('/terminate-replica')
63
+ async def terminate_replica(
64
+ request: fastapi.Request,
65
+ terminate_replica_body: payloads.ServeTerminateReplicaBody,
66
+ ) -> None:
67
+ executor.schedule_request(
68
+ request_id=request.state.request_id,
69
+ request_name='serve.terminate_replica',
70
+ request_body=terminate_replica_body,
71
+ func=core.terminate_replica,
72
+ schedule_type=api_requests.ScheduleType.SHORT,
73
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
74
+ )
75
+
76
+
77
+ @router.post('/status')
78
+ async def status(
79
+ request: fastapi.Request,
80
+ status_body: payloads.ServeStatusBody,
81
+ ) -> None:
82
+ executor.schedule_request(
83
+ request_id=request.state.request_id,
84
+ request_name='serve.status',
85
+ request_body=status_body,
86
+ func=core.status,
87
+ schedule_type=api_requests.ScheduleType.SHORT,
88
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
89
+ )
90
+
91
+
92
+ @router.post('/logs')
93
+ async def tail_logs(
94
+ request: fastapi.Request, log_body: payloads.ServeLogsBody,
95
+ background_tasks: fastapi.BackgroundTasks
96
+ ) -> fastapi.responses.StreamingResponse:
97
+ executor.schedule_request(
98
+ request_id=request.state.request_id,
99
+ request_name='serve.logs',
100
+ request_body=log_body,
101
+ func=core.tail_logs,
102
+ schedule_type=api_requests.ScheduleType.SHORT,
103
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
104
+ )
105
+
106
+ request_task = api_requests.get_request(request.state.request_id)
107
+
108
+ return stream_utils.stream_response(
109
+ request_id=request_task.request_id,
110
+ logs_path=request_task.log_path,
111
+ background_tasks=background_tasks,
112
+ )
sky/serve/service.py CHANGED
@@ -9,7 +9,7 @@ import pathlib
9
9
  import shutil
10
10
  import time
11
11
  import traceback
12
- from typing import Dict, List
12
+ from typing import Dict
13
13
 
14
14
  import filelock
15
15
 
@@ -73,6 +73,12 @@ def cleanup_storage(task_yaml: str) -> bool:
73
73
  try:
74
74
  task = task_lib.Task.from_yaml(task_yaml)
75
75
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
76
+ # Need to re-construct storage object in the controller process
77
+ # because when SkyPilot API server machine sends the yaml config to the
78
+ # controller machine, only storage metadata is sent, not the storage
79
+ # object itself.
80
+ for storage in task.storage_mounts.values():
81
+ storage.construct()
76
82
  backend.teardown_ephemeral_storage(task)
77
83
  except Exception as e: # pylint: disable=broad-except
78
84
  logger.error('Failed to clean up storage: '
@@ -116,15 +122,17 @@ def _cleanup(service_name: str) -> bool:
116
122
  logger.error(f'Replica {info.replica_id} failed to terminate.')
117
123
  versions = serve_state.get_service_versions(service_name)
118
124
  serve_state.remove_service_versions(service_name)
119
- success = True
120
- for version in versions:
125
+
126
+ def cleanup_version_storage(version: int) -> bool:
121
127
  task_yaml: str = serve_utils.generate_task_yaml_file_name(
122
128
  service_name, version)
123
129
  logger.info(f'Cleaning up storage for version {version}, '
124
130
  f'task_yaml: {task_yaml}')
125
- success = success and cleanup_storage(task_yaml)
126
- if not success:
131
+ return cleanup_storage(task_yaml)
132
+
133
+ if not all(map(cleanup_version_storage, versions)):
127
134
  failed = True
135
+
128
136
  return failed
129
137
 
130
138
 
@@ -148,7 +156,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
148
156
  controller_job_id=job_id,
149
157
  policy=service_spec.autoscaling_policy_str(),
150
158
  requested_resources_str=backend_utils.get_task_resources_str(task),
151
- status=serve_state.ServiceStatus.CONTROLLER_INIT)
159
+ load_balancing_policy=service_spec.load_balancing_policy,
160
+ status=serve_state.ServiceStatus.CONTROLLER_INIT,
161
+ tls_encrypted=service_spec.tls_credential is not None)
152
162
  # Directly throw an error here. See sky/serve/api.py::up
153
163
  # for more details.
154
164
  if not success:
@@ -156,6 +166,10 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
156
166
  with ux_utils.print_exception_no_traceback():
157
167
  raise ValueError(f'Service {service_name} already exists.')
158
168
 
169
+ # Add initial version information to the service state.
170
+ serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION,
171
+ service_spec)
172
+
159
173
  # Create the service working directory.
160
174
  service_dir = os.path.expanduser(
161
175
  serve_utils.generate_remote_service_dir_name(service_name))
@@ -182,19 +196,39 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
182
196
  os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)):
183
197
  controller_port = common_utils.find_free_port(
184
198
  constants.CONTROLLER_PORT_START)
199
+
200
+ # We expose the controller to the public network when running
201
+ # inside a kubernetes cluster to allow external load balancers
202
+ # (example, for high availability load balancers) to communicate
203
+ # with the controller.
204
+ def _get_host():
205
+ if 'KUBERNETES_SERVICE_HOST' in os.environ:
206
+ return '0.0.0.0'
207
+ # Not using localhost to avoid using ipv6 address and causing
208
+ # the following error:
209
+ # ERROR: [Errno 99] error while attempting to bind on address
210
+ # ('::1', 20001, 0, 0): cannot assign requested address
211
+ return '127.0.0.1'
212
+
213
+ controller_host = _get_host()
214
+
185
215
  # Start the controller.
186
216
  controller_process = multiprocessing.Process(
187
217
  target=controller.run_controller,
188
- args=(service_name, service_spec, task_yaml, controller_port))
218
+ args=(service_name, service_spec, task_yaml, controller_host,
219
+ controller_port))
189
220
  controller_process.start()
190
221
  serve_state.set_service_controller_port(service_name,
191
222
  controller_port)
192
223
 
193
- # TODO(tian): Support HTTPS.
194
- controller_addr = f'http://localhost:{controller_port}'
224
+ controller_addr = f'http://{controller_host}:{controller_port}'
225
+
195
226
  load_balancer_port = common_utils.find_free_port(
196
227
  constants.LOAD_BALANCER_PORT_START)
197
228
 
229
+ # Extract the load balancing policy from the service spec
230
+ policy_name = service_spec.load_balancing_policy
231
+
198
232
  # Start the load balancer.
199
233
  # TODO(tian): Probably we could enable multiple ports specified in
200
234
  # service spec and we could start multiple load balancers.
@@ -203,7 +237,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
203
237
  target=ux_utils.RedirectOutputForProcess(
204
238
  load_balancer.run_load_balancer,
205
239
  load_balancer_log_file).run,
206
- args=(controller_addr, load_balancer_port))
240
+ args=(controller_addr, load_balancer_port, policy_name,
241
+ service_spec.tls_credential))
207
242
  load_balancer_process.start()
208
243
  serve_state.set_service_load_balancer_port(service_name,
209
244
  load_balancer_port)
@@ -215,15 +250,15 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
215
250
  serve_state.set_service_status_and_active_versions(
216
251
  service_name, serve_state.ServiceStatus.SHUTTING_DOWN)
217
252
  finally:
218
- process_to_kill: List[multiprocessing.Process] = []
219
- if load_balancer_process is not None:
220
- process_to_kill.append(load_balancer_process)
221
- if controller_process is not None:
222
- process_to_kill.append(controller_process)
223
253
  # Kill load balancer process first since it will raise errors if failed
224
254
  # to connect to the controller. Then the controller process.
255
+ process_to_kill = [
256
+ proc for proc in [load_balancer_process, controller_process]
257
+ if proc is not None
258
+ ]
225
259
  subprocess_utils.kill_children_processes(
226
- [process.pid for process in process_to_kill], force=True)
260
+ parent_pids=[process.pid for process in process_to_kill],
261
+ force=True)
227
262
  for process in process_to_kill:
228
263
  process.join()
229
264
  failed = _cleanup(service_name)
@@ -234,6 +269,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
234
269
  else:
235
270
  shutil.rmtree(service_dir)
236
271
  serve_state.remove_service(service_name)
272
+ serve_state.delete_all_versions(service_name)
237
273
  logger.info(f'Service {service_name} terminated successfully.')
238
274
 
239
275
 
sky/serve/service_spec.py CHANGED
@@ -2,11 +2,14 @@
2
2
  import json
3
3
  import os
4
4
  import textwrap
5
- from typing import Any, Dict, Optional
5
+ from typing import Any, Dict, List, Optional
6
6
 
7
7
  import yaml
8
8
 
9
+ from sky import serve
9
10
  from sky.serve import constants
11
+ from sky.serve import load_balancing_policies as lb_policies
12
+ from sky.serve import serve_utils
10
13
  from sky.utils import common_utils
11
14
  from sky.utils import schemas
12
15
  from sky.utils import ux_utils
@@ -19,22 +22,19 @@ class SkyServiceSpec:
19
22
  self,
20
23
  readiness_path: str,
21
24
  initial_delay_seconds: int,
25
+ readiness_timeout_seconds: int,
22
26
  min_replicas: int,
23
27
  max_replicas: Optional[int] = None,
28
+ ports: Optional[str] = None,
24
29
  target_qps_per_replica: Optional[float] = None,
25
30
  post_data: Optional[Dict[str, Any]] = None,
31
+ tls_credential: Optional[serve_utils.TLSCredential] = None,
26
32
  readiness_headers: Optional[Dict[str, str]] = None,
27
33
  dynamic_ondemand_fallback: Optional[bool] = None,
28
34
  base_ondemand_fallback_replicas: Optional[int] = None,
29
35
  upscale_delay_seconds: Optional[int] = None,
30
36
  downscale_delay_seconds: Optional[int] = None,
31
- # The following arguments are deprecated.
32
- # TODO(ziming): remove this after 2 minor release, i.e. 0.6.0.
33
- # Deprecated: Always be True
34
- auto_restart: Optional[bool] = None,
35
- # Deprecated: replaced by the target_qps_per_replica.
36
- qps_upper_threshold: Optional[float] = None,
37
- qps_lower_threshold: Optional[float] = None,
37
+ load_balancing_policy: Optional[str] = None,
38
38
  ) -> None:
39
39
  if max_replicas is not None and max_replicas < min_replicas:
40
40
  with ux_utils.print_exception_no_traceback():
@@ -61,27 +61,23 @@ class SkyServiceSpec:
61
61
  raise ValueError('readiness_path must start with a slash (/). '
62
62
  f'Got: {readiness_path}')
63
63
 
64
- # TODO(tian): Following field are deprecated. Remove after 2 minor
65
- # release, i.e. 0.6.0.
66
- if qps_upper_threshold is not None or qps_lower_threshold is not None:
64
+ # Add the check for unknown load balancing policies
65
+ if (load_balancing_policy is not None and
66
+ load_balancing_policy not in serve.LB_POLICIES):
67
67
  with ux_utils.print_exception_no_traceback():
68
68
  raise ValueError(
69
- 'Field `qps_upper_threshold` and `qps_lower_threshold`'
70
- 'under `replica_policy` are deprecated. '
71
- 'Please use target_qps_per_replica instead.')
72
- if auto_restart is not None:
73
- with ux_utils.print_exception_no_traceback():
74
- raise ValueError(
75
- 'Field `auto_restart` under `replica_policy` is deprecated.'
76
- 'Currently, SkyServe will cleanup failed replicas'
77
- 'and auto restart it to keep the service running.')
78
-
69
+ f'Unknown load balancing policy: {load_balancing_policy}. '
70
+ f'Available policies: {list(serve.LB_POLICIES.keys())}')
79
71
  self._readiness_path: str = readiness_path
80
72
  self._initial_delay_seconds: int = initial_delay_seconds
73
+ self._readiness_timeout_seconds: int = readiness_timeout_seconds
81
74
  self._min_replicas: int = min_replicas
82
75
  self._max_replicas: Optional[int] = max_replicas
76
+ self._ports: Optional[str] = ports
83
77
  self._target_qps_per_replica: Optional[float] = target_qps_per_replica
84
78
  self._post_data: Optional[Dict[str, Any]] = post_data
79
+ self._tls_credential: Optional[serve_utils.TLSCredential] = (
80
+ tls_credential)
85
81
  self._readiness_headers: Optional[Dict[str, str]] = readiness_headers
86
82
  self._dynamic_ondemand_fallback: Optional[
87
83
  bool] = dynamic_ondemand_fallback
@@ -89,6 +85,7 @@ class SkyServiceSpec:
89
85
  int] = base_ondemand_fallback_replicas
90
86
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
91
87
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
88
+ self._load_balancing_policy: Optional[str] = load_balancing_policy
92
89
 
93
90
  self._use_ondemand_fallback: bool = (
94
91
  self.dynamic_ondemand_fallback is not None and
@@ -113,16 +110,23 @@ class SkyServiceSpec:
113
110
  service_config['readiness_path'] = readiness_section
114
111
  initial_delay_seconds = None
115
112
  post_data = None
113
+ readiness_timeout_seconds = None
116
114
  readiness_headers = None
117
115
  else:
118
116
  service_config['readiness_path'] = readiness_section['path']
119
117
  initial_delay_seconds = readiness_section.get(
120
118
  'initial_delay_seconds', None)
121
119
  post_data = readiness_section.get('post_data', None)
120
+ readiness_timeout_seconds = readiness_section.get(
121
+ 'timeout_seconds', None)
122
122
  readiness_headers = readiness_section.get('headers', None)
123
123
  if initial_delay_seconds is None:
124
124
  initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS
125
125
  service_config['initial_delay_seconds'] = initial_delay_seconds
126
+ if readiness_timeout_seconds is None:
127
+ readiness_timeout_seconds = (
128
+ constants.DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS)
129
+ service_config['readiness_timeout_seconds'] = readiness_timeout_seconds
126
130
  if isinstance(post_data, str):
127
131
  try:
128
132
  post_data = json.loads(post_data)
@@ -135,6 +139,14 @@ class SkyServiceSpec:
135
139
  service_config['post_data'] = post_data
136
140
  service_config['readiness_headers'] = readiness_headers
137
141
 
142
+ ports = config.get('ports', None)
143
+ if ports is not None:
144
+ assert isinstance(ports, int)
145
+ if not 1 <= ports <= 65535:
146
+ with ux_utils.print_exception_no_traceback():
147
+ raise ValueError('Port must be between 1 and 65535.')
148
+ service_config['ports'] = str(ports) if ports is not None else None
149
+
138
150
  policy_section = config.get('replica_policy', None)
139
151
  simplified_policy_section = config.get('replicas', None)
140
152
  if policy_section is None or simplified_policy_section is not None:
@@ -151,14 +163,8 @@ class SkyServiceSpec:
151
163
  service_config['min_replicas'] = policy_section['min_replicas']
152
164
  service_config['max_replicas'] = policy_section.get(
153
165
  'max_replicas', None)
154
- service_config['qps_upper_threshold'] = policy_section.get(
155
- 'qps_upper_threshold', None)
156
- service_config['qps_lower_threshold'] = policy_section.get(
157
- 'qps_lower_threshold', None)
158
166
  service_config['target_qps_per_replica'] = policy_section.get(
159
167
  'target_qps_per_replica', None)
160
- service_config['auto_restart'] = policy_section.get(
161
- 'auto_restart', None)
162
168
  service_config['upscale_delay_seconds'] = policy_section.get(
163
169
  'upscale_delay_seconds', None)
164
170
  service_config['downscale_delay_seconds'] = policy_section.get(
@@ -169,6 +175,16 @@ class SkyServiceSpec:
169
175
  service_config['dynamic_ondemand_fallback'] = policy_section.get(
170
176
  'dynamic_ondemand_fallback', None)
171
177
 
178
+ service_config['load_balancing_policy'] = config.get(
179
+ 'load_balancing_policy', None)
180
+
181
+ tls_section = config.get('tls', None)
182
+ if tls_section is not None:
183
+ service_config['tls_credential'] = serve_utils.TLSCredential(
184
+ keyfile=tls_section.get('keyfile', None),
185
+ certfile=tls_section.get('certfile', None),
186
+ )
187
+
172
188
  return SkyServiceSpec(**service_config)
173
189
 
174
190
  @staticmethod
@@ -192,9 +208,12 @@ class SkyServiceSpec:
192
208
  return SkyServiceSpec.from_yaml_config(config['service'])
193
209
 
194
210
  def to_yaml_config(self) -> Dict[str, Any]:
195
- config = dict()
211
+ config: Dict[str, Any] = {}
196
212
 
197
- def add_if_not_none(section, key, value, no_empty: bool = False):
213
+ def add_if_not_none(section: str,
214
+ key: Optional[str],
215
+ value: Any,
216
+ no_empty: bool = False):
198
217
  if no_empty and not value:
199
218
  return
200
219
  if value is not None:
@@ -209,6 +228,8 @@ class SkyServiceSpec:
209
228
  add_if_not_none('readiness_probe', 'initial_delay_seconds',
210
229
  self.initial_delay_seconds)
211
230
  add_if_not_none('readiness_probe', 'post_data', self.post_data)
231
+ add_if_not_none('readiness_probe', 'timeout_seconds',
232
+ self.readiness_timeout_seconds)
212
233
  add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
213
234
  add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
214
235
  add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
@@ -222,6 +243,12 @@ class SkyServiceSpec:
222
243
  self.upscale_delay_seconds)
223
244
  add_if_not_none('replica_policy', 'downscale_delay_seconds',
224
245
  self.downscale_delay_seconds)
246
+ add_if_not_none('load_balancing_policy', None,
247
+ self._load_balancing_policy)
248
+ add_if_not_none('ports', None, int(self.ports) if self.ports else None)
249
+ if self.tls_credential is not None:
250
+ add_if_not_none('tls', 'keyfile', self.tls_credential.keyfile)
251
+ add_if_not_none('tls', 'certfile', self.tls_credential.certfile)
225
252
  return config
226
253
 
227
254
  def probe_str(self):
@@ -233,8 +260,8 @@ class SkyServiceSpec:
233
260
  ' with custom headers')
234
261
  return f'{method}{headers}'
235
262
 
236
- def spot_policy_str(self):
237
- policy_strs = []
263
+ def spot_policy_str(self) -> str:
264
+ policy_strs: List[str] = []
238
265
  if (self.dynamic_ondemand_fallback is not None and
239
266
  self.dynamic_ondemand_fallback):
240
267
  policy_strs.append('Dynamic on-demand fallback')
@@ -249,7 +276,9 @@ class SkyServiceSpec:
249
276
  policy_strs.append('Static spot mixture with '
250
277
  f'{self.base_ondemand_fallback_replicas} '
251
278
  f'base on-demand replica{plural}')
252
- return ' '.join(policy_strs) if policy_strs else 'No spot policy'
279
+ if not policy_strs:
280
+ return 'No spot fallback policy'
281
+ return ' '.join(policy_strs)
253
282
 
254
283
  def autoscaling_policy_str(self):
255
284
  # TODO(MaoZiming): Update policy_str
@@ -264,12 +293,24 @@ class SkyServiceSpec:
264
293
  f'replica{max_plural} (target QPS per replica: '
265
294
  f'{self.target_qps_per_replica})')
266
295
 
296
+ def set_ports(self, ports: str) -> None:
297
+ self._ports = ports
298
+
299
+ def tls_str(self):
300
+ if self.tls_credential is None:
301
+ return 'No TLS Enabled'
302
+ return (f'Keyfile: {self.tls_credential.keyfile}, '
303
+ f'Certfile: {self.tls_credential.certfile}')
304
+
267
305
  def __repr__(self) -> str:
268
306
  return textwrap.dedent(f"""\
269
307
  Readiness probe method: {self.probe_str()}
270
308
  Readiness initial delay seconds: {self.initial_delay_seconds}
309
+ Readiness probe timeout seconds: {self.readiness_timeout_seconds}
271
310
  Replica autoscaling policy: {self.autoscaling_policy_str()}
311
+ TLS Certificates: {self.tls_str()}
272
312
  Spot Policy: {self.spot_policy_str()}
313
+ Load Balancing Policy: {self.load_balancing_policy}
273
314
  """)
274
315
 
275
316
  @property
@@ -280,6 +321,10 @@ class SkyServiceSpec:
280
321
  def initial_delay_seconds(self) -> int:
281
322
  return self._initial_delay_seconds
282
323
 
324
+ @property
325
+ def readiness_timeout_seconds(self) -> int:
326
+ return self._readiness_timeout_seconds
327
+
283
328
  @property
284
329
  def min_replicas(self) -> int:
285
330
  return self._min_replicas
@@ -289,6 +334,10 @@ class SkyServiceSpec:
289
334
  # If None, treated as having the same value of min_replicas.
290
335
  return self._max_replicas
291
336
 
337
+ @property
338
+ def ports(self) -> Optional[str]:
339
+ return self._ports
340
+
292
341
  @property
293
342
  def target_qps_per_replica(self) -> Optional[float]:
294
343
  return self._target_qps_per_replica
@@ -297,6 +346,15 @@ class SkyServiceSpec:
297
346
  def post_data(self) -> Optional[Dict[str, Any]]:
298
347
  return self._post_data
299
348
 
349
+ @property
350
+ def tls_credential(self) -> Optional[serve_utils.TLSCredential]:
351
+ return self._tls_credential
352
+
353
+ @tls_credential.setter
354
+ def tls_credential(self,
355
+ value: Optional[serve_utils.TLSCredential]) -> None:
356
+ self._tls_credential = value
357
+
300
358
  @property
301
359
  def readiness_headers(self) -> Optional[Dict[str, str]]:
302
360
  return self._readiness_headers
@@ -320,3 +378,8 @@ class SkyServiceSpec:
320
378
  @property
321
379
  def use_ondemand_fallback(self) -> bool:
322
380
  return self._use_ondemand_fallback
381
+
382
+ @property
383
+ def load_balancing_policy(self) -> str:
384
+ return lb_policies.LoadBalancingPolicy.make_policy_name(
385
+ self._load_balancing_policy)
sky/server/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """SkyPilot API Server."""