skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,487 @@
1
+ """Payloads for the Sky API requests.
2
+
3
+ TODO(zhwu): We can consider a better way to handle the default values of the
4
+ kwargs for the payloads; otherwise, we have to keep the default values in sync
5
+ with the backend functions. The benefit of having the default values in the
6
+ payloads is that a user can find the default values in the Restful API docs.
7
+ """
8
+ import getpass
9
+ import json
10
+ import os
11
+ from typing import Any, Dict, List, Optional, Tuple, Union
12
+
13
+ import pydantic
14
+
15
+ from sky import serve
16
+ from sky import sky_logging
17
+ from sky import skypilot_config
18
+ from sky.server import common
19
+ from sky.skylet import constants
20
+ from sky.usage import constants as usage_constants
21
+ from sky.usage import usage_lib
22
+ from sky.utils import annotations
23
+ from sky.utils import common as common_lib
24
+ from sky.utils import common_utils
25
+ from sky.utils import registry
26
+
27
+ logger = sky_logging.init_logger(__name__)
28
+
29
+
30
@annotations.lru_cache(scope='global')
def request_body_env_vars() -> dict:
    """Collect the environment variables to attach to a request body.

    The result contains every variable in `os.environ` whose name starts with
    `constants.SKYPILOT_ENV_VAR_PREFIX`, plus the user hash, the user name and
    the usage run id. The entry that points at the SkyPilot config file is
    removed.

    Returns:
        A dict mapping environment variable names to their values.
    """
    env_vars = {
        name: value
        for name, value in os.environ.items()
        if name.startswith(constants.SKYPILOT_ENV_VAR_PREFIX)
    }
    env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
    # Only fall back to getpass.getuser() when the variable is unset:
    # getpass.getuser() can raise when the current user cannot be determined,
    # which must not break the case where the user name is given explicitly.
    user_name = os.getenv(constants.USER_ENV_VAR)
    if user_name is None:
        user_name = getpass.getuser()
    env_vars[constants.USER_ENV_VAR] = user_name
    env_vars[
        usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
    # Remove the path to config file, as the config content is included in the
    # request body and will be merged with the config on the server side.
    env_vars.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
    return env_vars
45
+
46
+
47
def get_override_skypilot_config_from_client() -> Dict[str, Any]:
    """Build the config overrides that the client sends to the API server.

    Starts from `skypilot_config.to_dict()`, drops the `api_server` section and
    every key listed in `constants.SKIPPED_CLIENT_OVERRIDE_KEYS`, and logs (at
    debug level) the skipped keys that were actually set.
    """
    client_config = skypilot_config.to_dict()
    # Remove the API server config, as we should not specify the SkyPilot
    # server endpoint on the server side. This avoids the warning below.
    client_config.pop_nested(('api_server',), default_value=None)

    dropped = {}
    for key_path in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
        removed = client_config.pop_nested(key_path, default_value=None)
        if removed is None:
            continue
        dropped['.'.join(key_path)] = removed

    if not dropped:
        return client_config

    logger.debug(f'The following keys ({json.dumps(dropped)}) '
                 'are specified in the client SkyPilot config at '
                 f'{skypilot_config.loaded_config_path()!r}. '
                 'This will be ignored. If you want to specify it, '
                 'please modify it on server side or contact your '
                 'administrator.')
    return client_config
66
+
67
+
68
class RequestBody(pydantic.BaseModel):
    """The request body for the SkyPilot API.

    Fields not supplied by the caller are filled in `__init__` from the local
    environment (env vars, entrypoint, API server locality, client config).
    """
    env_vars: Dict[str, str] = {}
    entrypoint: str = ''
    entrypoint_command: str = ''
    using_remote_api_server: bool = False
    override_skypilot_config: Optional[Dict[str, Any]] = {}

    def __init__(self, **data):
        """Fill in the common fields that the caller did not provide.

        Each default is computed only when its key is missing from `data`, so
        an explicitly supplied value never triggers the helper that would have
        produced a (discarded) default.
        """
        if 'env_vars' not in data:
            data['env_vars'] = request_body_env_vars()
        if 'entrypoint' not in data:
            usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
            if usage_lib_entrypoint is None:
                usage_lib_entrypoint = ''
            data['entrypoint'] = usage_lib_entrypoint
        if 'entrypoint_command' not in data:
            data['entrypoint_command'] = (
                common_utils.get_pretty_entrypoint_cmd())
        if 'using_remote_api_server' not in data:
            data['using_remote_api_server'] = (
                not common.is_api_server_local())
        if 'override_skypilot_config' not in data:
            data['override_skypilot_config'] = (
                get_override_skypilot_config_from_client())
        super().__init__(**data)

    def to_kwargs(self) -> Dict[str, Any]:
        """Convert the request body to a kwargs dictionary on API server.

        This converts the request body into kwargs for the underlying SkyPilot
        backend's function. The fields declared on `RequestBody` itself are
        removed, leaving only the fields added by subclasses.
        """
        kwargs = self.model_dump()
        for common_field in ('env_vars', 'entrypoint', 'entrypoint_command',
                             'using_remote_api_server',
                             'override_skypilot_config'):
            kwargs.pop(common_field)
        return kwargs

    @property
    def user_hash(self) -> Optional[str]:
        """The value of `constants.USER_ID_ENV_VAR` in `env_vars`, if any."""
        return self.env_vars.get(constants.USER_ID_ENV_VAR)
108
+
109
+
110
class CheckBody(RequestBody):
    """Request payload for the check endpoint."""
    # No default is declared, so the field must be passed explicitly (it may
    # be None).
    clouds: Optional[Tuple[str, ...]]
    verbose: bool
114
+
115
+
116
class ValidateBody(RequestBody):
    """Request payload for the validate endpoint."""
    # The DAG is carried as a string.
    dag: str
119
+
120
+
121
class OptimizeBody(RequestBody):
    """Request payload for the optimize endpoint."""
    dag: str
    minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST

    def to_kwargs(self) -> Dict[str, Any]:
        """Return the kwargs with `dag` replaced by the loaded DAG object."""
        # Import here to avoid requirement of the whole SkyPilot dependency on
        # local clients.
        # pylint: disable=import-outside-toplevel
        from sky.utils import dag_utils

        kwargs = super().to_kwargs()
        # We should not validate the dag here, as the file mounts are not
        # processed yet, but we need to validate the resources during the
        # optimization to make sure the resources are available.
        kwargs['dag'] = dag_utils.load_chain_dag_from_yaml_str(self.dag)
        return kwargs
140
+
141
+
142
class LaunchBody(RequestBody):
    """Request payload for the launch endpoint."""
    task: str
    cluster_name: str
    retry_until_up: bool = False
    idle_minutes_to_autostop: Optional[int] = None
    dryrun: bool = False
    down: bool = False
    backend: Optional[str] = None
    optimize_target: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
    no_setup: bool = False
    clone_disk_from: Optional[str] = None
    fast: bool = False
    # Internal only:
    # pylint: disable=invalid-name
    quiet_optimizer: bool = False
    is_launched_by_jobs_controller: bool = False
    is_launched_by_sky_serve_controller: bool = False
    disable_controller_check: bool = False

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the task and backend resolved to objects.

        The internal-only fields are re-keyed with a leading underscore.
        """
        kwargs = super().to_kwargs()
        kwargs_task = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)

        backend_cls = registry.BACKEND_REGISTRY.from_str(self.backend)
        if backend_cls is not None:
            backend_obj = backend_cls()
        else:
            backend_obj = None

        kwargs['task'] = kwargs_task
        kwargs['backend'] = backend_obj
        for internal_field in ('quiet_optimizer',
                               'is_launched_by_jobs_controller',
                               'is_launched_by_sky_serve_controller',
                               'disable_controller_check'):
            kwargs[f'_{internal_field}'] = kwargs.pop(internal_field)
        return kwargs
181
+
182
+
183
class ExecBody(RequestBody):
    """Request payload for the exec endpoint."""
    task: str
    cluster_name: str
    dryrun: bool = False
    down: bool = False
    backend: Optional[str] = None

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the task and backend resolved to objects."""
        kwargs = super().to_kwargs()
        # Unlike the launch payload, mounts are processed with
        # workdir_only=True here.
        kwargs_task = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=True)
        backend_cls = registry.BACKEND_REGISTRY.from_str(self.backend)
        if backend_cls is not None:
            backend_obj = backend_cls()
        else:
            backend_obj = None
        kwargs['task'] = kwargs_task
        kwargs['backend'] = backend_obj
        return kwargs
202
+
203
+
204
class StopOrDownBody(RequestBody):
    """Request payload naming a cluster, with an optional purge flag."""
    cluster_name: str
    purge: bool = False
207
+
208
+
209
class StatusBody(RequestBody):
    """Request payload for the status endpoint."""
    cluster_names: Optional[List[str]] = None
    refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
    # Note: defaults to True here, unlike the other `all_users` flags in this
    # file.
    all_users: bool = True
214
+
215
+
216
class StartBody(RequestBody):
    """Request payload for the start endpoint."""
    cluster_name: str
    idle_minutes_to_autostop: Optional[int] = None
    retry_until_up: bool = False
    down: bool = False
    force: bool = False
223
+
224
+
225
class AutostopBody(RequestBody):
    """Request payload for the autostop endpoint."""
    cluster_name: str
    # Required: there is no default.
    idle_minutes: int
    down: bool = False
230
+
231
+
232
class QueueBody(RequestBody):
    """Request payload for the queue endpoint."""
    cluster_name: str
    skip_finished: bool = False
    all_users: bool = False
237
+
238
+
239
class CancelBody(RequestBody):
    """Request payload for the cancel endpoint."""
    cluster_name: str
    # No default is declared, so the field must be passed explicitly (it may
    # be None).
    job_ids: Optional[List[int]]
    all: bool = False
    all_users: bool = False
    # Internal only. We cannot use prefix `_` because pydantic will not
    # include it in the request body.
    try_cancel_if_cluster_is_init: bool = False

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the internal flag re-keyed with a leading `_`."""
        kwargs = super().to_kwargs()
        internal_flag = kwargs.pop('try_cancel_if_cluster_is_init')
        kwargs['_try_cancel_if_cluster_is_init'] = internal_flag
        return kwargs
254
+
255
+
256
class ClusterNameBody(RequestBody):
    """Request payload that carries only a cluster name."""
    cluster_name: str
259
+
260
+
261
class ClusterJobBody(RequestBody):
    """Request payload for the cluster job endpoint."""
    cluster_name: str
    # No default is declared, so the field must be passed explicitly (it may
    # be None).
    job_id: Optional[int]
    follow: bool = True
    tail: int = 0
267
+
268
+
269
class ClusterJobsBody(RequestBody):
    """Request payload for the cluster jobs endpoint."""
    cluster_name: str
    # Job ids are typed as strings here. No default is declared, so the field
    # must be passed explicitly (it may be None).
    job_ids: Optional[List[str]]
273
+
274
+
275
class ClusterJobsDownloadLogsBody(RequestBody):
    """Request payload for the cluster jobs download logs endpoint."""
    cluster_name: str
    # Job ids are typed as strings here. No default is declared, so the field
    # must be passed explicitly (it may be None).
    job_ids: Optional[List[str]]
    local_dir: str = constants.SKY_LOGS_DIRECTORY
280
+
281
+
282
class DownloadBody(RequestBody):
    """Request payload for the download endpoint."""
    folder_paths: List[str]
285
+
286
+
287
class StorageBody(RequestBody):
    """Request payload for the storage endpoint."""
    name: str
290
+
291
+
292
class EndpointsBody(RequestBody):
    """Request payload for the endpoints query of a cluster."""
    cluster: str
    # The port may be given as an int or a string; None is the default.
    port: Optional[Union[int, str]] = None
296
+
297
+
298
class ServeEndpointBody(RequestBody):
    """Request payload for the serve controller endpoint."""
    # The port may be given as an int or a string; None is the default.
    port: Optional[Union[int, str]] = None
301
+
302
+
303
class JobStatusBody(RequestBody):
    """Request payload for the job status endpoint."""
    cluster_name: str
    # No default is declared, so the field must be passed explicitly (it may
    # be None).
    job_ids: Optional[List[int]]
307
+
308
+
309
class JobsLaunchBody(RequestBody):
    """Request payload for the jobs launch endpoint."""
    task: str
    # No default is declared, so the field must be passed explicitly (it may
    # be None).
    name: Optional[str]

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with `task` replaced by the mount-processed result."""
        kwargs = super().to_kwargs()
        processed_task = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)
        kwargs['task'] = processed_task
        return kwargs
319
+
320
+
321
class JobsQueueBody(RequestBody):
    """Request payload for the jobs queue endpoint."""
    refresh: bool = False
    skip_finished: bool = False
    all_users: bool = False
326
+
327
+
328
class JobsCancelBody(RequestBody):
    """Request payload for the jobs cancel endpoint."""
    # `name` and `job_ids` declare no default, so both must be passed
    # explicitly (either may be None).
    name: Optional[str]
    job_ids: Optional[List[int]]
    all: bool = False
    all_users: bool = False
334
+
335
+
336
class JobsLogsBody(RequestBody):
    """Request payload for the jobs logs endpoint."""
    name: Optional[str] = None
    job_id: Optional[int] = None
    follow: bool = True
    controller: bool = False
    refresh: bool = False
343
+
344
+
345
class RequestCancelBody(RequestBody):
    """Request payload for the API request cancellation endpoint."""
    # Kill all requests if request_ids is None.
    request_ids: Optional[List[str]] = None
    user_id: Optional[str] = None
350
+
351
+
352
class RequestStatusBody(pydantic.BaseModel):
    """Request payload for the API request status endpoint.

    This derives from `pydantic.BaseModel` directly rather than from
    `RequestBody`, so it has none of the common request fields.
    """
    request_ids: Optional[List[str]] = None
    all_status: bool = False
356
+
357
+
358
class ServeUpBody(RequestBody):
    """Request payload for the serve up endpoint."""
    task: str
    service_name: str

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with `task` replaced by the DAG's single task."""
        kwargs = super().to_kwargs()
        dag = common.process_mounts_in_task_on_api_server(self.task,
                                                          self.env_vars,
                                                          workdir_only=False)
        num_tasks = len(dag.tasks)
        # A service is defined by exactly one task.
        assert num_tasks == 1, ('Must only specify one task in the DAG for '
                                'a service.', dag)
        kwargs['task'] = dag.tasks[0]
        return kwargs
373
+
374
+
375
class ServeUpdateBody(RequestBody):
    """Request payload for the serve update endpoint."""
    task: str
    service_name: str
    mode: serve.UpdateMode

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with `task` replaced by the DAG's single task."""
        kwargs = super().to_kwargs()
        dag = common.process_mounts_in_task_on_api_server(self.task,
                                                          self.env_vars,
                                                          workdir_only=False)
        num_tasks = len(dag.tasks)
        # A service is defined by exactly one task.
        assert num_tasks == 1, ('Must only specify one task in the DAG for '
                                'a service.', dag)
        kwargs['task'] = dag.tasks[0]
        return kwargs
391
+
392
+
393
class ServeDownBody(RequestBody):
    """Request payload for the serve down endpoint."""
    # Accepts a single name or a list of names. No default is declared, so the
    # field must be passed explicitly (it may be None).
    service_names: Optional[Union[str, List[str]]]
    all: bool = False
    purge: bool = False
398
+
399
+
400
class ServeLogsBody(RequestBody):
    """Request payload for the serve logs endpoint."""
    service_name: str
    # Either a plain string or a `serve.ServiceComponent` value.
    target: Union[str, serve.ServiceComponent]
    replica_id: Optional[int] = None
    follow: bool = True
406
+
407
+
408
class ServeStatusBody(RequestBody):
    """Request payload for the serve status endpoint."""
    # Accepts a single name or a list of names. No default is declared, so the
    # field must be passed explicitly (it may be None).
    service_names: Optional[Union[str, List[str]]]
411
+
412
+
413
class RealtimeGpuAvailabilityRequestBody(RequestBody):
    """Request payload for the realtime GPU availability endpoint."""
    # None of the three fields declares a default, so each must be passed
    # explicitly (each may be None).
    context: Optional[str]
    name_filter: Optional[str]
    quantity_filter: Optional[int]
418
+
419
+
420
class KubernetesNodeInfoRequestBody(RequestBody):
    """Request payload for the kubernetes node info endpoint."""
    context: Optional[str] = None
423
+
424
+
425
class ListAcceleratorsBody(RequestBody):
    """Request payload for the list accelerators endpoint."""
    gpus_only: bool = True
    name_filter: Optional[str] = None
    region_filter: Optional[str] = None
    quantity_filter: Optional[int] = None
    # Accepts a single cloud name or a list of names.
    clouds: Optional[Union[List[str], str]] = None
    all_regions: bool = False
    require_price: bool = True
    case_sensitive: bool = True
435
+
436
+
437
class ListAcceleratorCountsBody(RequestBody):
    """Request payload for the list accelerator counts endpoint."""
    gpus_only: bool = True
    name_filter: Optional[str] = None
    region_filter: Optional[str] = None
    quantity_filter: Optional[int] = None
    # Accepts a single cloud name or a list of names.
    clouds: Optional[Union[List[str], str]] = None
444
+
445
+
446
class LocalUpBody(RequestBody):
    """Request payload for the local up endpoint."""
    gpus: bool = True
    ips: Optional[List[str]] = None
    ssh_user: Optional[str] = None
    ssh_key: Optional[str] = None
    cleanup: bool = False
453
+
454
+
455
class ServeTerminateReplicaBody(RequestBody):
    """Request payload for the serve terminate replica endpoint."""
    service_name: str
    replica_id: int
    purge: bool = False
460
+
461
+
462
class KillRequestProcessesBody(RequestBody):
    """Request payload for the kill request processes endpoint."""
    # Required: there is no default.
    request_ids: List[str]
465
+
466
+
467
class StreamBody(pydantic.BaseModel):
    """Request payload for the stream endpoint.

    This derives from `pydantic.BaseModel` directly rather than from
    `RequestBody`, so it has none of the common request fields.
    """
    request_id: Optional[str] = None
    log_path: Optional[str] = None
    tail: Optional[int] = None
    plain_logs: bool = True
473
+
474
+
475
class JobsDownloadLogsBody(RequestBody):
    """Request payload for the jobs download logs endpoint."""
    # `name` and `job_id` declare no default, so both must be passed
    # explicitly (either may be None).
    name: Optional[str]
    job_id: Optional[int]
    refresh: bool = False
    controller: bool = False
    local_dir: str = constants.SKY_LOGS_DIRECTORY
482
+
483
+
484
class UploadZipFileResponse(pydantic.BaseModel):
    """Response payload returned by the upload zip file endpoint."""
    status: str
    missing_chunks: Optional[List[str]] = None
File without changes
@@ -0,0 +1,76 @@
1
+ """Shared queues for multiprocessing."""
2
+ from multiprocessing import managers
3
+ import queue
4
+ import time
5
+ from typing import List
6
+
7
+ from sky import sky_logging
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ # The default port used by SkyPilot API server's request queue.
12
+ # We avoid 50010, as it might be taken by HDFS.
13
+ DEFAULT_QUEUE_MANAGER_PORT = 50011
14
+
15
+
16
+ # Have to create custom manager to handle different processes connecting to the
17
+ # same manager and getting the same queues.
18
class QueueManager(managers.BaseManager):
    """Manager subclass on which the shared request queues are registered."""
20
+
21
+
22
def start_queue_manager(queue_names: List[str],
                        port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
    """Register one queue per name and serve them until the process exits.

    This call blocks: it ends in `serve_forever()`.

    Args:
        queue_names: Names under which the queues are registered on
            `QueueManager`.
        port: Port on localhost that the manager server binds to.
    """

    # A factory function is used instead of writing `lambda: shared_queue`
    # directly in the loop: a lambda created in the loop would capture the loop
    # variable by reference, so every registered getter would end up returning
    # the queue from the last iteration.
    def _bind(shared_queue):

        def _getter():
            return shared_queue

        return _getter

    for queue_name in queue_names:
        shared_queue: queue.Queue = queue.Queue()
        QueueManager.register(queue_name, callable=_bind(shared_queue))

    # Start long-running manager server.
    # Manager will set socket.SO_REUSEADDR, but BSD and Linux have different
    # behaviors on this option:
    # - BSD (e.g. macOS): * (0.0.0.0) does not conflict with other addresses
    #   on the same port.
    # - Linux: on the contrary, * conflicts with any other addresses.
    # So on BSD systems, binding to * while the port is already bound to
    # localhost (127.0.0.1) will succeed, but the Manager won't actually be able
    # to accept connections on localhost.
    # For portability, we use the loopback address instead of *.
    queue_manager = QueueManager(address=('localhost', port),
                                 authkey=b'skypilot')
    queue_manager.get_server().serve_forever()
49
+
50
+
51
def get_queue(queue_name: str,
              port: int = DEFAULT_QUEUE_MANAGER_PORT) -> queue.Queue:
    """Connect to the queue manager on localhost and return the named queue.

    Args:
        queue_name: Name the queue was registered under.
        port: Port the queue manager listens on.

    Returns:
        The object produced by calling the manager attribute named
        `queue_name`.
    """
    QueueManager.register(queue_name)
    client = QueueManager(address=('localhost', port), authkey=b'skypilot')
    client.connect()
    queue_factory = getattr(client, queue_name)
    return queue_factory()
57
+
58
+
59
def wait_for_queues_to_be_ready(queue_names: List[str],
                                port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
    """Wait for the queues to be ready after queue manager is just started.

    Every name in `queue_names` is probed with `get_queue`, retrying every
    0.2 seconds while the connection is refused. All queues share a single
    5-second time budget.

    Args:
        queue_names: Names of the queues to wait for. The list is not
            modified.
        port: Port the queue manager listens on.

    Raises:
        RuntimeError: If a queue still refuses connections after the time
            budget has elapsed.
    """
    max_wait_time = 5
    # A monotonic clock keeps the timeout correct if the wall clock is
    # adjusted while waiting.
    deadline = time.monotonic() + max_wait_time
    # Iterate over a snapshot so that the caller's list is left untouched.
    for queue_name in list(queue_names):
        while True:
            try:
                get_queue(queue_name, port)
                break
            except ConnectionRefusedError as e:
                if time.monotonic() > deadline:
                    raise RuntimeError(
                        f'Request queue, named {queue_name!r}, '
                        f'is not ready after {max_wait_time} seconds.') from e
                logger.info(f'Waiting for request queue, named {queue_name!r}, '
                            f'to be ready...')
                time.sleep(0.2)