skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,383 +0,0 @@
1
- """
2
- Helper class for some OCI operations methods which needs to be shared/called
3
- by multiple places.
4
-
5
- History:
6
- - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
7
-
8
- """
9
-
10
- from datetime import datetime
11
- import logging
12
- import re
13
- import time
14
- import traceback
15
- import typing
16
- from typing import Optional
17
-
18
- from sky.adaptors import common as adaptors_common
19
- from sky.adaptors import oci as oci_adaptor
20
- from sky.clouds.utils import oci_utils
21
- from sky.skylet.providers.oci import utils
22
-
23
- if typing.TYPE_CHECKING:
24
- import pandas as pd
25
- else:
26
- pd = adaptors_common.LazyImport('pandas')
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
-
31
class oci_query_helper:
    """Namespace of classmethods wrapping the OCI calls shared by callers.

    All methods are classmethods; the class keeps no state of its own. The
    OCI clients are obtained through ``oci_adaptor`` and the profile and
    naming constants come from ``oci_utils.oci_config``.
    """

    # Call Cloud API to try getting the satisfied nodes.
    @classmethod
    @utils.debug_enabled(logger=logger)
    def query_instances_by_tags(cls, tag_filters, region):
        """Search for instances that carry every given freeform tag.

        Args:
            tag_filters: Mapping of freeform tag key to the required value.
                The per-tag clauses are joined with ``&&``.
            region: Region handed to the search client factory.

        Returns:
            The ``items`` of the structured-search response. Instances in
            the TERMINATED or TERMINATING lifecycle state are excluded by
            the query itself.
        """
        where_clause_tags = " && ".join(
            f"(freeformTags.key = '{tag_key}'"
            f" && freeformTags.value = '{tag_filters[tag_key]}')"
            for tag_key in tag_filters)

        qv_str = (f"query instance resources where {where_clause_tags}"
                  f" && (lifecycleState != 'TERMINATED'"
                  f" && lifecycleState != 'TERMINATING')")

        qv = oci_adaptor.oci.resource_search.models.StructuredSearchDetails(
            query=qv_str,
            type="Structured",
            matching_context_type=oci_adaptor.oci.resource_search.models.
            SearchDetails.MATCHING_CONTEXT_TYPE_NONE,
        )

        list_instances_response = oci_adaptor.get_search_client(
            region, oci_utils.oci_config.get_profile()).search_resources(qv)
        return list_instances_response.data.items

    @classmethod
    def terminate_instances_by_tags(cls, tag_filters, region) -> int:
        """Terminate every instance matched by ``tag_filters``.

        Failures are logged and counted rather than raised, so one failing
        instance does not stop the remaining ones from being terminated.

        Returns:
            The number of instances whose termination call failed.
        """
        logger.debug(f"Terminate instance by tags: {tag_filters}")
        insts = cls.query_instances_by_tags(tag_filters, region)
        fail_count = 0
        for inst in insts:
            inst_id = inst.identifier
            logger.debug(f"Got instance(to be terminated): {inst_id}")

            try:
                oci_adaptor.get_core_client(
                    region,
                    oci_utils.oci_config.get_profile()).terminate_instance(
                        inst_id)
            except Exception as e:  # best effort: keep going with the rest
                fail_count += 1
                logger.error(f"Terminate instance failed: {str(e)}\n: {inst}")
                traceback.print_exc()

        if fail_count == 0:
            logger.debug("Instance teardown result: OK")
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(f"Instance teardown result: {fail_count} failed!")

        return fail_count

    @classmethod
    @utils.debug_enabled(logger=logger)
    def subscribe_image(cls, compartment_id, listing_id, resource_version,
                        region):
        """Subscribe the compartment to an app-catalog listing version.

        Does nothing when ``listing_id`` is NA or the literal text
        ``"None"`` / ``"nan"``.

        Raises:
            RuntimeError: If fetching the agreements or creating the
                subscription fails; the original error is chained as the
                cause.
        """
        if (pd.isna(listing_id) or listing_id.strip() == "None" or
                listing_id.strip() == "nan"):
            return

        core_client = oci_adaptor.get_core_client(
            region, oci_utils.oci_config.get_profile())
        try:
            agreements_response = core_client.get_app_catalog_listing_agreements(
                listing_id=listing_id, resource_version=resource_version)
            agreements = agreements_response.data

            # Replace "<3 digits>+HH:MM" at the end of the stringified
            # timestamp with "Z" so it parses with the format below.
            time_retrieved = datetime.strptime(
                re.sub(r"\d{3}\+\d{2}\:\d{2}", "Z",
                       str(agreements.time_retrieved)),
                "%Y-%m-%d %H:%M:%S.%fZ",
            )

            core_client.create_app_catalog_subscription(
                create_app_catalog_subscription_details=oci_adaptor.oci.core.
                models.CreateAppCatalogSubscriptionDetails(
                    compartment_id=compartment_id,
                    listing_id=listing_id,
                    listing_resource_version=agreements.
                    listing_resource_version,
                    oracle_terms_of_use_link=agreements.
                    oracle_terms_of_use_link,
                    time_retrieved=time_retrieved,
                    signature=agreements.signature,
                    eula_link=agreements.eula_link,
                ))
        except Exception as e:
            logger.critical(
                f"subscribe_image: {listing_id} - {resource_version} ... [Failed] "
                f"Error message: {str(e)}")
            raise RuntimeError("ERR: Image subscription error!") from e

    @classmethod
    @utils.debug_enabled(logger=logger)
    def find_compartment(cls, region) -> str:
        """Return the compartment id to use for ``region``.

        Resolution order: the compartment configured for the region, then
        an ACTIVE compartment named ``oci_config.COMPARTMENT`` under the
        tenancy, and finally the tenancy (root compartment) itself.
        """
        # Try to use the configured one first
        skypilot_compartment = oci_utils.oci_config.get_compartment(region)
        if skypilot_compartment is not None:
            return skypilot_compartment

        # If not specified, we try to find the one skypilot-compartment
        # Pass-in a profile parameter so that multiple profile in oci
        # config file is supported (2023/06/09).
        root = oci_adaptor.get_oci_config(
            region, oci_utils.oci_config.get_profile())['tenancy']
        list_compartments_response = oci_adaptor.get_identity_client(
            region, oci_utils.oci_config.get_profile()).list_compartments(
                compartment_id=root,
                name=oci_utils.oci_config.COMPARTMENT,
                lifecycle_state='ACTIVE',
                limit=1)
        compartments = list_compartments_response.data
        if len(compartments) > 0:
            return compartments[0].id

        # Finally, fall back to the root compartment when nothing matched.
        return root

    @classmethod
    @utils.debug_enabled(logger=logger)
    def find_create_vcn_subnet(cls, region) -> Optional[str]:
        """Return the subnet id to use, creating the SkyPilot VCN if needed.

        Resolution order: the subnet configured for the region, then the
        subnet named ``oci_config.VCN_SUBNET_NAME`` inside an existing VCN
        named ``oci_config.VCN_NAME``; otherwise a new VCN and subnet are
        created via ``create_vcn_subnet``.

        Raises:
            RuntimeError: If the VCN exists but the expected subnet does not.
        """
        subnet = oci_utils.oci_config.get_vcn_subnet(region)
        if subnet is not None:
            # User explicitly specified the subnet in sky config.
            return subnet

        # Try to reuse the skypilot_vcn.
        net_client = oci_adaptor.get_net_client(
            region, oci_utils.oci_config.get_profile())
        skypilot_compartment = cls.find_compartment(region)
        list_vcns_response = net_client.list_vcns(
            compartment_id=skypilot_compartment,
            display_name=oci_utils.oci_config.VCN_NAME,
            lifecycle_state="AVAILABLE")
        vcns = list_vcns_response.data
        if len(vcns) == 0:
            # Create the skypilot_vcn and related resources
            return cls.create_vcn_subnet(net_client, skypilot_compartment)

        # Found the VCN.
        skypilot_vcn = vcns[0].id
        list_subnets_response = net_client.list_subnets(
            compartment_id=skypilot_compartment,
            limit=1,
            vcn_id=skypilot_vcn,
            display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
            lifecycle_state="AVAILABLE")
        logger.debug(f'Got VCN subnet \n{list_subnets_response.data}')
        if len(list_subnets_response.data) < 1:
            logger.error(
                f'No subnet {oci_utils.oci_config.VCN_SUBNET_NAME} '
                f'found in the VCN {oci_utils.oci_config.VCN_NAME}')
            raise RuntimeError(
                f'VcnSubnetNotFound Error: No subnet '
                f'{oci_utils.oci_config.VCN_SUBNET_NAME} found in '
                f'the VCN {oci_utils.oci_config.VCN_NAME}')
        return list_subnets_response.data[0].id

    @classmethod
    @utils.debug_enabled(logger=logger)
    def create_vcn_subnet(cls, net_client,
                          skypilot_compartment) -> Optional[str]:
        """Create the SkyPilot VCN, gateways and subnet.

        Steps, in order: create the VCN, an internet gateway, the subnet,
        a service gateway (only when a matching regional service is
        listed), then update the default security list and route table.

        Returns:
            The id of the created subnet, or ``None`` when an OCI service
            exception occurred, in which case whatever was already created
            is passed to ``delete_vcn`` for cleanup.
        """
        # Bind everything the cleanup path reads up front, so a failure at
        # any step (or the absence of a service gateway) cannot trigger an
        # UnboundLocalError inside the except handler.
        skypilot_vcn = None
        subnet = None
        ig = None
        sg = None
        try:
            create_vcn_response = net_client.create_vcn(
                create_vcn_details=oci_adaptor.oci.core.models.CreateVcnDetails(
                    compartment_id=skypilot_compartment,
                    cidr_blocks=[oci_utils.oci_config.VCN_CIDR],
                    display_name=oci_utils.oci_config.VCN_NAME,
                    is_ipv6_enabled=False,
                    dns_label=oci_utils.oci_config.VCN_DNS_LABEL))
            vcn_data = create_vcn_response.data
            logger.debug(f'Created VCN \n{vcn_data}')
            skypilot_vcn = vcn_data.id
            route_table = vcn_data.default_route_table_id
            security_list = vcn_data.default_security_list_id
            dhcp_options_id = vcn_data.default_dhcp_options_id

            # Create internet gateway for internet access
            create_ig_response = net_client.create_internet_gateway(
                create_internet_gateway_details=oci_adaptor.oci.core.models.
                CreateInternetGatewayDetails(
                    compartment_id=skypilot_compartment,
                    is_enabled=True,
                    vcn_id=skypilot_vcn,
                    display_name=oci_utils.oci_config.VCN_INTERNET_GATEWAY_NAME
                ))
            logger.debug(
                f'Created internet gateway \n{create_ig_response.data}')
            ig = create_ig_response.data.id

            # Create a public subnet.
            create_subnet_response = net_client.create_subnet(
                create_subnet_details=oci_adaptor.oci.core.models.
                CreateSubnetDetails(
                    cidr_block=oci_utils.oci_config.VCN_SUBNET_CIDR,
                    compartment_id=skypilot_compartment,
                    vcn_id=skypilot_vcn,
                    dhcp_options_id=dhcp_options_id,
                    display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
                    prohibit_internet_ingress=False,
                    prohibit_public_ip_on_vnic=False,
                    route_table_id=route_table,
                    security_list_ids=[security_list]))
            logger.debug(f'Created subnet \n{create_subnet_response.data}')
            subnet = create_subnet_response.data.id

            list_services_response = net_client.list_services(limit=100)
            services = [
                s for s in list_services_response.data
                if str(s.cidr_block).startswith('all-') and str(s.cidr_block).
                endswith('-services-in-oracle-services-network')
            ]
            if len(services) > 0:
                # Create service gateway for regional services.
                create_sg_response = net_client.create_service_gateway(
                    create_service_gateway_details=oci_adaptor.oci.core.models.
                    CreateServiceGatewayDetails(
                        compartment_id=skypilot_compartment,
                        services=[
                            oci_adaptor.oci.core.models.ServiceIdRequestDetails(
                                service_id=services[0].id)
                        ],
                        vcn_id=skypilot_vcn))
                logger.debug(f'Service Gateway: \n{create_sg_response.data}')
                sg = create_sg_response.data.id

            # Update security list: Allow all traffic in the same subnet
            update_security_list_response = net_client.update_security_list(
                security_list_id=security_list,
                update_security_list_details=oci_adaptor.oci.core.models.
                UpdateSecurityListDetails(ingress_security_rules=[
                    oci_adaptor.oci.core.models.IngressSecurityRule(
                        protocol="6",
                        source=oci_utils.oci_config.VCN_CIDR_INTERNET,
                        is_stateless=False,
                        source_type="CIDR_BLOCK",
                        tcp_options=oci_adaptor.oci.core.models.TcpOptions(
                            destination_port_range=oci_adaptor.oci.core.models.
                            PortRange(max=22, min=22),
                            source_port_range=oci_adaptor.oci.core.models.
                            PortRange(max=65535, min=1)),
                        description="Allow SSH port."),
                    oci_adaptor.oci.core.models.IngressSecurityRule(
                        protocol="all",
                        source=oci_utils.oci_config.VCN_SUBNET_CIDR,
                        is_stateless=False,
                        source_type="CIDR_BLOCK",
                        description="Allow all traffic from/to same subnet."),
                    oci_adaptor.oci.core.models.IngressSecurityRule(
                        protocol="1",
                        source=oci_utils.oci_config.VCN_CIDR_INTERNET,
                        is_stateless=False,
                        source_type="CIDR_BLOCK",
                        icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
                            type=3, code=4),
                        description="ICMP traffic."),
                    oci_adaptor.oci.core.models.IngressSecurityRule(
                        protocol="1",
                        source=oci_utils.oci_config.VCN_CIDR,
                        is_stateless=False,
                        source_type="CIDR_BLOCK",
                        icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
                            type=3),
                        description="ICMP traffic (VCN)."),
                ]))
            logger.debug(
                f'Updated security_list: \n{update_security_list_response.data}'
            )

            # Update route table: bind to the internet gateway
            update_route_table_response = net_client.update_route_table(
                rt_id=route_table,
                update_route_table_details=oci_adaptor.oci.core.models.
                UpdateRouteTableDetails(route_rules=[
                    oci_adaptor.oci.core.models.RouteRule(
                        network_entity_id=create_ig_response.data.id,
                        destination='0.0.0.0/0',
                        destination_type='CIDR_BLOCK',
                        description='Route table for SkyPilot VCN',
                        route_type='STATIC')
                ]))
            logger.debug(f'Route table: \n{update_route_table_response.data}')

        except oci_adaptor.service_exception() as e:
            logger.error(f'Create VCN Error: Create new VCN '
                         f'{oci_utils.oci_config.VCN_NAME} failed: {str(e)}')
            # In case of partial success while creating vcn
            cls.delete_vcn(net_client, skypilot_vcn, subnet, ig, sg)
            subnet = None

        return subnet

    @classmethod
    @utils.debug_enabled(logger=logger)
    def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
                   internet_gateway, service_gateway):
        """Delete a VCN together with its gateways and subnet.

        Each of ``internet_gateway``, ``service_gateway`` and
        ``skypilot_subnet`` is skipped when ``None``. The VCN deletion is
        retried up to ``oci_config.MAX_RETRY_COUNT`` times, sleeping
        ``oci_config.RETRY_INTERVAL_BASE_SECONDS`` between attempts. OCI
        service exceptions are logged, not propagated.
        """
        if skypilot_vcn is None:
            return  # Nothing to delete
        try:
            if internet_gateway is not None:
                # Delete internet gateway
                delete_ig_response = net_client.delete_internet_gateway(
                    ig_id=internet_gateway)
                logger.debug(f'Deleted internet gateway {internet_gateway}'
                             f'-{delete_ig_response.data}')
            if service_gateway is not None:
                # Delete service gateway
                delete_sg_response = net_client.delete_service_gateway(
                    service_gateway_id=service_gateway)
                logger.debug(f'Deleted service gateway {service_gateway}'
                             f'-{delete_sg_response.data}')
            if skypilot_subnet is not None:
                # Delete subnet
                delete_subnet_response = net_client.delete_subnet(
                    subnet_id=skypilot_subnet)
                logger.debug(f'Deleted subnet {skypilot_subnet}'
                             f'-{delete_subnet_response.data}')
            # Delete vcn
            retry_count = 0
            while retry_count < oci_utils.oci_config.MAX_RETRY_COUNT:
                try:
                    delete_vcn_response = net_client.delete_vcn(
                        vcn_id=skypilot_vcn)
                    logger.debug(
                        f'Deleted vcn {skypilot_vcn}-{delete_vcn_response.data}'
                    )
                    break
                except oci_adaptor.service_exception() as e:
                    logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
                    retry_count = retry_count + 1
                    if retry_count == oci_utils.oci_config.MAX_RETRY_COUNT:
                        # Out of retries: hand over to the outer handler.
                        raise
                    time.sleep(
                        oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)

        except oci_adaptor.service_exception() as e:
            logger.error(
                f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
@@ -1,21 +0,0 @@
1
- from datetime import datetime
2
- import functools
3
- from logging import Logger
4
-
5
-
6
def debug_enabled(logger: Logger):
    """Build a decorator that logs entry to and exit from a function.

    Args:
        logger: Logger that receives the DEBUG-level "Enter" and "Exit"
            messages.

    Returns:
        A decorator. The wrapped function keeps its metadata (via
        ``functools.wraps``), returns whatever the original returns, and
        lets any exception propagate; the "Exit" message is logged in
        both cases.
    """

    def _timestamp() -> str:
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def decorate(f):

        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            # Lazy %-style arguments: args/kwargs are only rendered when
            # the logger actually emits DEBUG records.
            logger.debug("%s Enter %s, %s, %s", _timestamp(), f, args,
                         kwargs)
            try:
                return f(*args, **kwargs)
            finally:
                # Take a fresh timestamp so the exit line reflects when the
                # call finished, not when it started.
                logger.debug("%s Exit %s", _timestamp(), f)

        return wrapper

    return decorate
@@ -1,24 +0,0 @@
1
- """Utility functions for cluster yaml file."""
2
-
3
- import re
4
-
5
- # The cluster yaml used to create the current cluster where the module is
6
- # called.
7
- SKY_CLUSTER_YAML_REMOTE_PATH = '~/.sky/sky_ray.yml'
8
-
9
-
10
def get_provider_name(config: dict) -> str:
    """Return the name of the provider.

    The name is the dotted-path component that follows ``providers.`` or
    ``provision.`` in ``config['provider']['module']``, lower-cased.

    Args:
        config: Cluster config containing ``config['provider']['module']``.

    Returns:
        The provider name, e.g. ``'aws'``; ``'lambda_cloud'`` is reported
        as ``'lambda'``.

    Raises:
        AssertionError: If the module path contains neither ``providers.``
            nor ``provision.`` followed by a name.
        KeyError: If ``config`` lacks the ``provider`` / ``module`` keys.
    """
    provider_module = config['provider']['module']
    # Examples:
    #   'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
    #   'sky.provision.aws' -> 'aws'
    provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
                                provider_module)
    if provider_search is None:
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged for callers.
        raise AssertionError(config)
    provider_name = provider_search.group(1).lower()
    # Special handling for lambda_cloud as Lambda cloud is registered as lambda.
    if provider_name == 'lambda_cloud':
        provider_name = 'lambda'
    return provider_name
@@ -1,137 +0,0 @@
1
#!/bin/bash
# This script creates a new k8s Service Account and generates a kubeconfig with
# its credentials. This Service Account has all the necessary permissions for
# SkyPilot. The kubeconfig is written in the current directory.
#
# You must configure your local kubectl to point to the right k8s cluster and
# have admin-level access.
#
# Note: all of the k8s resources are created in namespace "default" unless
# overridden. If you delete any of these objects, SkyPilot will stop working.
#
# You can override the default namespace "default" using the
# SKYPILOT_NAMESPACE environment variable.
# You can override the default service account name "skypilot-sa" using the
# SKYPILOT_SA_NAME environment variable.

set -eu -o pipefail

# Allow passing in the service account name and namespace in the environment.
# If not provided, use the defaults.
SKYPILOT_SA=${SKYPILOT_SA_NAME:-skypilot-sa}
NAMESPACE=${SKYPILOT_NAMESPACE:-default}

# Set OS specific values.
case "${OSTYPE}" in
  linux*)
    # Covers linux-gnu, linux-musl and other linux-* variants.
    BASE64_DECODE_FLAG="-d"
    ;;
  darwin*)
    BASE64_DECODE_FLAG="-D"
    ;;
  *)
    echo "Unknown OS ${OSTYPE}"
    exit 1
    ;;
esac

# NOTE: the ClusterRole below grants every verb on every resource in every
# API group, which is broader than the message printed here suggests.
echo "Creating the Kubernetes Service Account with minimal RBAC permissions."
kubectl apply -f - <<EOF
apiVersion: v1
kind: Namespace
metadata:
  name: ${NAMESPACE}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ${SKYPILOT_SA}
  namespace: ${NAMESPACE}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: skypilot-role
rules:
  - apiGroups: ["*"]
    resources: ["*"]
    verbs: ["*"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: skypilot-crb
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: skypilot-role
subjects:
  - kind: ServiceAccount
    name: ${SKYPILOT_SA}
    namespace: ${NAMESPACE}
EOF

# Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
# version below 1.24, otherwise one must manually create the secret and bind it to the Service account to have a non expiring token.
# After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
# We can use kubectl create token but the token has a expiration time.
# https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.24.md#urgent-upgrade-notes
SA_SECRET_NAME=$(kubectl get -n "${NAMESPACE}" "sa/${SKYPILOT_SA}" -o "jsonpath={.secrets[0]..name}")
# Quoted so an empty value (or one containing spaces) is tested correctly.
if [ -z "${SA_SECRET_NAME}" ]
then
# Create the secret and bind it to the desired SA
kubectl apply -f - <<EOF
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: ${SKYPILOT_SA}
  namespace: ${NAMESPACE}
  annotations:
    kubernetes.io/service-account.name: "${SKYPILOT_SA}"
EOF

SA_SECRET_NAME=${SKYPILOT_SA}
fi

# Note: service account token is stored base64-encoded in the secret but must
# be plaintext in kubeconfig.
SA_TOKEN=$(kubectl get -n "${NAMESPACE}" "secrets/${SA_SECRET_NAME}" -o "jsonpath={.data['token']}" | base64 ${BASE64_DECODE_FLAG})
CA_CERT=$(kubectl get -n "${NAMESPACE}" "secrets/${SA_SECRET_NAME}" -o "jsonpath={.data['ca\.crt']}")

# Extract the cluster name and server address from the current context.
CURRENT_CONTEXT=$(kubectl config current-context)
CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\")].context.cluster}")
CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\")].cluster.server}")

echo "Writing kubeconfig."
cat > kubeconfig <<EOF
apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: ${CA_CERT}
    server: ${CURRENT_CLUSTER_ADDR}
  name: ${CURRENT_CLUSTER}
contexts:
- context:
    cluster: ${CURRENT_CLUSTER}
    user: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
  name: ${CURRENT_CONTEXT}
current-context: ${CURRENT_CONTEXT}
kind: Config
preferences: {}
users:
- name: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
  user:
    token: ${SA_TOKEN}
EOF

echo "---
Done!

Copy the generated kubeconfig file to your SkyPilot Proxy server, and set the
kubeconfig_file parameter in your skypilot.yaml config file to point to this
kubeconfig file.

If you need access to multiple kubernetes clusters, you can generate additional
kubeconfig files using this script and then merge them using merge-kubeconfigs.sh.

Note: Kubernetes RBAC rules for SkyPilot were created, you won't need to create them manually."