skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
1
+ """This module provides functions to generate GraphQL mutations for deploying
2
+ spot instance Pods on RunPod.
3
+
4
+ Reference:
5
+ https://github.com/runpod/runpod-python/blob/main/runpod/api/ctl_commands.py
6
+
7
+ Functions:
8
+ generate_spot_pod_deployment_mutation: Generates a GraphQL mutation string
9
+ for deploying a spot instance Pod on RunPod.
10
+
11
+ Example:
12
+ >>> mutation = generate_spot_pod_deployment_mutation(
13
+ name='test',
14
+ image_name='runpod/stack',
15
+ gpu_type_id='NVIDIA GeForce RTX 3070',
16
+ bid_per_gpu=0.3
17
+ )
18
+ """
19
+ from typing import List, Optional
20
+
21
+ from sky.adaptors import runpod
22
+ from sky.provision.runpod.api.pods import generate_spot_pod_deployment_mutation
23
+
24
+ _INTERRUPTABLE_POD_FIELD: str = 'podRentInterruptable'
25
+ _RESPONSE_DATA_FIELD: str = 'data'
26
+
27
+
28
def create_spot_pod(
    name: str,
    image_name: str,
    gpu_type_id: str,
    bid_per_gpu: float,
    cloud_type: str = 'ALL',
    volume_mount_path: str = '/runpod-volume',
    gpu_count: Optional[int] = 1,
    min_memory_in_gb: Optional[int] = 1,
    min_vcpu_count: Optional[int] = 1,
    container_disk_in_gb: Optional[int] = None,
    volume_in_gb: Optional[int] = 0,
    ports: Optional[str] = None,
    start_ssh: Optional[bool] = True,
    start_jupyter: Optional[bool] = False,
    env: Optional[dict] = None,
    docker_args: Optional[str] = '',
    support_public_ip: Optional[bool] = True,
    terminate_after: Optional[str] = None,
    stop_after: Optional[str] = None,
    data_center_id: Optional[str] = None,
    country_code: Optional[str] = None,
    network_volume_id: Optional[str] = None,
    allowed_cuda_versions: Optional[List[str]] = None,
    min_download: Optional[int] = None,
    min_upload: Optional[int] = None,
    cuda_version: Optional[str] = None,
    template_id: Optional[str] = None,
    volume_key: Optional[str] = None,
) -> dict:
    """Deploy a spot (interruptible) Pod on RunPod.

    Builds a `podRentInterruptable` GraphQL mutation from the arguments
    with `generate_spot_pod_deployment_mutation`, submits it through the
    RunPod SDK, and returns the `podRentInterruptable` entry of the
    response's `data` payload.

    Two arguments are filled in before the mutation is built:
      * `data_center_id`: when a `network_volume_id` is given without a
        data center, it is taken from the matching entry of the user's
        network volumes (and stays None if no entry matches).
      * `container_disk_in_gb`: set to 10 when neither it nor
        `template_id` is given.

    All other arguments are forwarded unchanged to the mutation builder.

    Raises:
        ValueError: if `cloud_type` is not ALL, COMMUNITY or SECURE.

    Example:
        >>> pod = create_spot_pod(
                name='test',
                image_name='runpod/stack',
                gpu_type_id='NVIDIA GeForce RTX 3070',
                bid_per_gpu=0.3
            )
    """
    # The result is unused; presumably this call rejects unknown GPU type
    # ids before anything is deployed -- TODO confirm against the SDK.
    runpod.runpod.get_gpu(gpu_type_id)

    # refer to https://graphql-spec.runpod.io/#definition-CloudTypeEnum
    if cloud_type not in ('ALL', 'COMMUNITY', 'SECURE'):
        raise ValueError('cloud_type must be one of ALL, COMMUNITY or SECURE')

    if network_volume_id and data_center_id is None:
        # Use the data center recorded on the requested network volume.
        known_volumes = runpod.runpod.get_user()['networkVolumes']
        data_center_id = next(
            (volume['dataCenterId']
             for volume in known_volumes
             if volume['id'] == network_volume_id),
            None,
        )

    if template_id is None and container_disk_in_gb is None:
        container_disk_in_gb = 10

    mutation_kwargs = dict(
        name=name,
        image_name=image_name,
        gpu_type_id=gpu_type_id,
        bid_per_gpu=bid_per_gpu,
        cloud_type=cloud_type,
        gpu_count=gpu_count,
        min_memory_in_gb=min_memory_in_gb,
        min_vcpu_count=min_vcpu_count,
        container_disk_in_gb=container_disk_in_gb,
        volume_in_gb=volume_in_gb,
        volume_mount_path=volume_mount_path,
        ports=ports,
        start_ssh=start_ssh,
        start_jupyter=start_jupyter,
        env=env,
        docker_args=docker_args,
        support_public_ip=support_public_ip,
        terminate_after=terminate_after,
        stop_after=stop_after,
        data_center_id=data_center_id,
        country_code=country_code,
        network_volume_id=network_volume_id,
        allowed_cuda_versions=allowed_cuda_versions,
        min_download=min_download,
        min_upload=min_upload,
        cuda_version=cuda_version,
        template_id=template_id,
        volume_key=volume_key,
    )
    query = generate_spot_pod_deployment_mutation(**mutation_kwargs)
    result = runpod.runpod.api.graphql.run_graphql_query(query)
    payload = result[_RESPONSE_DATA_FIELD]
    return payload[_INTERRUPTABLE_POD_FIELD]
@@ -0,0 +1,142 @@
1
+ """This module provides functions to generate GraphQL mutations for deploying
2
+ spot instance Pods on RunPod.
3
+
4
+ Reference:
5
+ https://github.com/runpod/runpod-python/blob/main/runpod/api/mutations/pods.py
6
+
7
+ Functions:
8
+ generate_spot_pod_deployment_mutation: Generates a GraphQL mutation string
9
+ for deploying a spot instance Pod on RunPod.
10
+ Example:
11
+ >>> mutation = generate_spot_pod_deployment_mutation(
12
+ name='test',
13
+ image_name='runpod/stack',
14
+ gpu_type_id='NVIDIA GeForce RTX 3070',
15
+ bid_per_gpu=0.3
16
+ )
17
+ """
18
+
19
+ from typing import List, Optional
20
+
21
+
22
+ # refer to https://graphql-spec.runpod.io/#definition-PodRentInterruptableInput
23
def generate_spot_pod_deployment_mutation(
    name: str,
    image_name: str,
    gpu_type_id: str,
    bid_per_gpu: float,
    volume_mount_path: str,
    cloud_type: str = 'ALL',
    gpu_count: Optional[int] = None,
    min_memory_in_gb: Optional[int] = None,
    min_vcpu_count: Optional[int] = None,
    container_disk_in_gb: Optional[int] = None,
    volume_in_gb: Optional[int] = None,
    ports: Optional[str] = None,
    start_ssh: Optional[bool] = True,
    start_jupyter: Optional[bool] = False,
    env: Optional[dict] = None,
    docker_args: Optional[str] = None,
    support_public_ip: Optional[bool] = True,
    terminate_after: Optional[str] = None,
    stop_after: Optional[str] = None,
    data_center_id: Optional[str] = None,
    country_code: Optional[str] = None,
    network_volume_id: Optional[str] = None,
    allowed_cuda_versions: Optional[List[str]] = None,
    min_download: Optional[int] = None,
    min_upload: Optional[int] = None,
    cuda_version: Optional[str] = None,
    template_id: Optional[str] = None,
    volume_key: Optional[str] = None,
) -> str:
    """Build the `podRentInterruptable` GraphQL mutation for a spot Pod.

    Each argument becomes one `key: value` entry of the mutation's `input`
    object, joined with ', '. Values are interpolated verbatim; no
    escaping is applied, so callers are responsible for any quoting inside
    string arguments.

    Rendering rules:
      * name, image_name, gpu_type_id, bid_per_gpu, volume_mount_path and
        cloud_type are always emitted.
      * `startSsh` / `startJupyter` are emitted (as `true`) only when the
        flag is truthy; `supportPublicIp` is always emitted as true/false.
      * Every other argument is emitted only when it is not None.
      * Spaces are removed from `ports` before it is emitted.
      * `env` is rendered as a list of `{ key: "...", value: "..." }`.

    Returns:
        The mutation document as a string. It requests id, desiredStatus,
        imageName, env, machineId and machine.podHostId of the new Pod.
    """

    def _bare(key, value):
        # Numbers, enum names and pre-rendered literals go in unquoted.
        return f'{key}: {value}'

    def _quoted(key, value):
        return f'{key}: "{value}"'

    # Required and defaulted fields: always present.
    entries = [
        _quoted('name', name),
        _quoted('imageName', image_name),
        _quoted('gpuTypeId', gpu_type_id),
        _bare('bidPerGpu', bid_per_gpu),
        _quoted('volumeMountPath', volume_mount_path),
        _bare('cloudType', cloud_type),
    ]

    if start_ssh:
        entries.append('startSsh: true')
    if start_jupyter:
        entries.append('startJupyter: true')
    entries.append(
        _bare('supportPublicIp', 'true' if support_public_ip else 'false'))

    compact_ports = None if ports is None else ports.replace(' ', '')
    cuda_versions_literal = None
    if allowed_cuda_versions is not None:
        quoted_versions = ', '.join(
            f'"{version}"' for version in allowed_cuda_versions)
        cuda_versions_literal = f'[{quoted_versions}]'

    # Optional fields, in emission order; a None value means "omit".
    optional_entries = (
        (_bare, 'gpuCount', gpu_count),
        (_bare, 'minMemoryInGb', min_memory_in_gb),
        (_bare, 'minVcpuCount', min_vcpu_count),
        (_bare, 'containerDiskInGb', container_disk_in_gb),
        (_bare, 'volumeInGb', volume_in_gb),
        (_quoted, 'ports', compact_ports),
        (_quoted, 'dockerArgs', docker_args),
        (_quoted, 'terminateAfter', terminate_after),
        (_quoted, 'stopAfter', stop_after),
        (_quoted, 'dataCenterId', data_center_id),
        (_quoted, 'countryCode', country_code),
        (_quoted, 'networkVolumeId', network_volume_id),
        (_bare, 'allowedCudaVersions', cuda_versions_literal),
        (_bare, 'minDownload', min_download),
        (_bare, 'minUpload', min_upload),
        (_quoted, 'cudaVersion', cuda_version),
        (_quoted, 'templateId', template_id),
        (_quoted, 'volumeKey', volume_key),
    )
    entries.extend(
        render(key, value)
        for render, key, value in optional_entries
        if value is not None)

    if env is not None:
        env_pairs = ', '.join(f'{{ key: "{key}", value: "{value}" }}'
                              for key, value in env.items())
        entries.append(f'env: [{env_pairs}]')

    # Format input fields
    input_string = ', '.join(entries)
    return f"""
    mutation {{
      podRentInterruptable(
        input: {{
          {input_string}
        }}
      ) {{
        id
        desiredStatus
        imageName
        env
        machineId
        machine {{
          podHostId
        }}
      }}
    }}
    """
@@ -3,24 +3,27 @@ import time
3
3
  from typing import Any, Dict, List, Optional
4
4
 
5
5
  from sky import sky_logging
6
- from sky import status_lib
7
6
  from sky.provision import common
8
7
  from sky.provision.runpod import utils
9
8
  from sky.utils import common_utils
9
+ from sky.utils import resources_utils
10
+ from sky.utils import status_lib
10
11
  from sky.utils import ux_utils
11
12
 
12
13
  POLL_INTERVAL = 5
14
+ QUERY_PORTS_TIMEOUT_SECONDS = 30
13
15
 
14
16
  logger = sky_logging.init_logger(__name__)
15
17
 
16
18
 
17
19
  def _filter_instances(cluster_name_on_cloud: str,
18
- status_filters: Optional[List[str]]) -> Dict[str, Any]:
20
+ status_filters: Optional[List[str]],
21
+ head_only: bool = False) -> Dict[str, Any]:
19
22
 
20
23
  instances = utils.list_instances()
21
- possible_names = [
22
- f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
23
- ]
24
+ possible_names = [f'{cluster_name_on_cloud}-head']
25
+ if not head_only:
26
+ possible_names.append(f'{cluster_name_on_cloud}-worker')
24
27
 
25
28
  filtered_instances = {}
26
29
  for instance_id, instance in instances.items():
@@ -80,10 +83,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
80
83
  node_type = 'head' if head_instance_id is None else 'worker'
81
84
  try:
82
85
  instance_id = utils.launch(
83
- name=f'{cluster_name_on_cloud}-{node_type}',
86
+ cluster_name=cluster_name_on_cloud,
87
+ node_type=node_type,
84
88
  instance_type=config.node_config['InstanceType'],
85
89
  region=region,
86
- disk_size=config.node_config['DiskSize'])
90
+ disk_size=config.node_config['DiskSize'],
91
+ image_name=config.node_config['ImageId'],
92
+ ports=config.ports_to_open_on_launch,
93
+ public_key=config.node_config['PublicKey'],
94
+ preemptible=config.node_config['Preemptible'],
95
+ bid_per_gpu=config.node_config['BidPerGPU'],
96
+ docker_login_config=config.provider_config.get(
97
+ 'docker_login_config'),
98
+ )
87
99
  except Exception as e: # pylint: disable=broad-except
88
100
  logger.warning(f'run_instances error: {e}')
89
101
  raise
@@ -136,6 +148,8 @@ def terminate_instances(
136
148
  """See sky/provision/__init__.py"""
137
149
  del provider_config # unused
138
150
  instances = _filter_instances(cluster_name_on_cloud, None)
151
+ template_name, registry_auth_id = utils.get_registry_auth_resources(
152
+ cluster_name_on_cloud)
139
153
  for inst_id, inst in instances.items():
140
154
  logger.debug(f'Terminating instance {inst_id}: {inst}')
141
155
  if worker_only and inst['name'].endswith('-head'):
@@ -148,6 +162,10 @@ def terminate_instances(
148
162
  f'Failed to terminate instance {inst_id}: '
149
163
  f'{common_utils.format_exception(e, use_bracket=False)}'
150
164
  ) from e
165
+ if template_name is not None:
166
+ utils.delete_pod_template(template_name)
167
+ if registry_auth_id is not None:
168
+ utils.delete_register_auth(registry_auth_id)
151
169
 
152
170
 
153
171
  def get_cluster_info(
@@ -205,6 +223,44 @@ def query_instances(
205
223
 
206
224
  def cleanup_ports(
207
225
  cluster_name_on_cloud: str,
226
+ ports: List[str],
208
227
  provider_config: Optional[Dict[str, Any]] = None,
209
228
  ) -> None:
210
- del cluster_name_on_cloud, provider_config
229
+ del cluster_name_on_cloud, ports, provider_config # Unused.
230
+
231
+
232
def query_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    head_ip: Optional[str] = None,
    provider_config: Optional[Dict[str, Any]] = None,
) -> Dict[int, List[common.Endpoint]]:
    """See sky/provision/__init__.py

    Polls the cluster's head instance once per second until every requested
    port has an endpoint, or until QUERY_PORTS_TIMEOUT_SECONDS have elapsed,
    in which case the ports that are ready so far are returned and a warning
    is logged. Returns an empty dict if no head instance is found.
    """
    del head_ip, provider_config  # Unused.
    # RunPod ports sometimes take a while to be ready.
    poll_started_at = time.time()
    requested_ports = resources_utils.port_ranges_to_set(ports)
    while True:
        head_instances = _filter_instances(cluster_name_on_cloud,
                                           None,
                                           head_only=True)
        assert len(head_instances) <= 1
        if not head_instances:
            # The user may have terminated the instance from the console;
            # with no head instance there is nothing to report.
            return {}
        head_instance = next(iter(head_instances.values()))

        ready_ports: Dict[int, List[common.Endpoint]] = {}
        for port, endpoint in head_instance['port2endpoint'].items():
            if port in requested_ports:
                ready_ports[port] = [common.SocketEndpoint(**endpoint)]

        pending_ports = requested_ports - set(ready_ports.keys())
        if not pending_ports:
            return ready_ports

        elapsed = time.time() - poll_started_at
        if elapsed > QUERY_PORTS_TIMEOUT_SECONDS:
            # Give up waiting and hand back whatever is ready so far.
            logger.warning(f'Querying ports {ports} timed out. Ports '
                           f'{pending_ports} are not ready.')
            return ready_ports
        time.sleep(1)