skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skypilot_config.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Immutable user configurations (EXPERIMENTAL).
2
2
 
3
- On module import, we attempt to parse the config located at CONFIG_PATH. Caller
4
- can then use
3
+ On module import, we attempt to parse the config located at CONFIG_PATH
4
+ (default: ~/.sky/config.yaml). Caller can then use
5
5
 
6
6
  >> skypilot_config.loaded()
7
7
 
@@ -11,6 +11,13 @@ To read a nested-key config:
11
11
 
12
12
  >> skypilot_config.get_nested(('auth', 'some_auth_config'), default_value)
13
13
 
14
+ The config can be overridden by the configs in task YAMLs. Callers are
15
+ responsible for providing the override_configs. If the nested key is part of
16
+ OVERRIDEABLE_CONFIG_KEYS, override_configs must be provided (can be empty):
17
+
18
+ >> skypilot_config.get_nested(('docker', 'run_options'), default_value,
19
+ override_configs={'docker': {'run_options': 'value'}})
20
+
14
21
  To set a value in the nested-key config:
15
22
 
16
23
  >> config_dict = skypilot_config.set_nested(('auth', 'some_key'), value)
@@ -41,18 +48,25 @@ then:
41
48
  skypilot_config.get_nested(('a', 'nonexist'), None) # ==> None
42
49
  skypilot_config.get_nested(('a',), None) # ==> None
43
50
  """
51
+ import contextlib
44
52
  import copy
45
53
  import os
46
54
  import pprint
47
- from typing import Any, Dict, Iterable, Optional
55
+ import tempfile
56
+ from typing import Any, Dict, Iterator, Optional, Tuple
48
57
 
49
58
  import yaml
50
59
 
60
+ from sky import exceptions
51
61
  from sky import sky_logging
62
+ from sky.skylet import constants
52
63
  from sky.utils import common_utils
64
+ from sky.utils import config_utils
53
65
  from sky.utils import schemas
54
66
  from sky.utils import ux_utils
55
67
 
68
+ logger = sky_logging.init_logger(__name__)
69
+
56
70
  # The config path is discovered in this order:
57
71
  #
58
72
  # (1) (Used internally) If env var {ENV_VAR_SKYPILOT_CONFIG} exists, use its
@@ -65,68 +79,65 @@ from sky.utils import ux_utils
65
79
  # (Used internally) An env var holding the path to the local config file. This
66
80
  # is only used by jobs controller tasks to ensure recoveries of the same job
67
81
  # use the same config file.
68
- ENV_VAR_SKYPILOT_CONFIG = 'SKYPILOT_CONFIG'
82
+ ENV_VAR_SKYPILOT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}CONFIG'
69
83
 
70
84
  # Path to the local config file.
71
85
  CONFIG_PATH = '~/.sky/config.yaml'
72
86
 
73
- logger = sky_logging.init_logger(__name__)
74
-
75
87
  # The loaded config.
76
- _dict = None
77
- _loaded_config_path = None
88
+ _dict = config_utils.Config()
89
+ _loaded_config_path: Optional[str] = None
78
90
 
79
91
 
80
- def get_nested(keys: Iterable[str], default_value: Any) -> Any:
92
+ def get_nested(keys: Tuple[str, ...],
93
+ default_value: Any,
94
+ override_configs: Optional[Dict[str, Any]] = None) -> Any:
81
95
  """Gets a nested key.
82
96
 
83
97
  If any key is not found, or any intermediate key does not point to a dict
84
98
  value, returns 'default_value'.
99
+
100
+ When 'keys' is within OVERRIDEABLE_CONFIG_KEYS, 'override_configs' must be
101
+ provided (can be empty). Otherwise, 'override_configs' must not be provided.
102
+
103
+ Args:
104
+ keys: A tuple of strings representing the nested keys.
105
+ default_value: The default value to return if the key is not found.
106
+ override_configs: A dict of override configs with the same schema as
107
+ the config file, but only containing the keys to override.
108
+
109
+ Returns:
110
+ The value of the nested key, or 'default_value' if not found.
85
111
  """
86
- if _dict is None:
87
- return default_value
88
- curr = _dict
89
- for key in keys:
90
- if isinstance(curr, dict) and key in curr:
91
- curr = curr[key]
92
- else:
93
- return default_value
94
- logger.debug(f'User config: {".".join(keys)} -> {curr}')
95
- return curr
112
+ return _dict.get_nested(
113
+ keys,
114
+ default_value,
115
+ override_configs,
116
+ allowed_override_keys=constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK,
117
+ disallowed_override_keys=None)
96
118
 
97
119
 
98
- def set_nested(keys: Iterable[str], value: Any) -> Dict[str, Any]:
120
+ def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
99
121
  """Returns a deep-copied config with the nested key set to value.
100
122
 
101
123
  Like get_nested(), if any key is not found, this will not raise an error.
102
124
  """
103
- _check_loaded_or_die()
104
- assert _dict is not None
105
- curr = copy.deepcopy(_dict)
106
- to_return = curr
107
- prev = None
108
- for i, key in enumerate(keys):
109
- if key not in curr:
110
- curr[key] = {}
111
- prev = curr
112
- curr = curr[key]
113
- if i == len(keys) - 1:
114
- prev_value = prev[key]
115
- prev[key] = value
116
- logger.debug(f'Set the value of {keys} to {value} (previous: '
117
- f'{prev_value}). Returning conf: {to_return}')
118
- return to_return
119
-
120
-
121
- def to_dict() -> Dict[str, Any]:
125
+ copied_dict = copy.deepcopy(_dict)
126
+ copied_dict.set_nested(keys, value)
127
+ return dict(**copied_dict)
128
+
129
+
130
+ def to_dict() -> config_utils.Config:
122
131
  """Returns a deep-copied version of the current config."""
123
- if _dict is not None:
124
- return copy.deepcopy(_dict)
125
- return {}
132
+ return copy.deepcopy(_dict)
126
133
 
127
134
 
128
- def _try_load_config() -> None:
135
+ def _reload_config() -> None:
129
136
  global _dict, _loaded_config_path
137
+ # Reset the global variables, to avoid using stale values.
138
+ _dict = config_utils.Config()
139
+ _loaded_config_path = None
140
+
130
141
  config_path_via_env_var = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
131
142
  if config_path_via_env_var is not None:
132
143
  config_path = os.path.expanduser(config_path_via_env_var)
@@ -142,18 +153,19 @@ def _try_load_config() -> None:
142
153
  config_path = os.path.expanduser(config_path)
143
154
  if os.path.exists(config_path):
144
155
  logger.debug(f'Using config path: {config_path}')
145
- _loaded_config_path = config_path
146
156
  try:
147
- _dict = common_utils.read_yaml(config_path)
157
+ config = common_utils.read_yaml(config_path)
158
+ _dict = config_utils.Config.from_dict(config)
159
+ _loaded_config_path = config_path
148
160
  logger.debug(f'Config loaded:\n{pprint.pformat(_dict)}')
149
161
  except yaml.YAMLError as e:
150
162
  logger.error(f'Error in loading config file ({config_path}):', e)
151
- if _dict is not None:
163
+ if _dict:
152
164
  common_utils.validate_schema(
153
165
  _dict,
154
166
  schemas.get_config_schema(),
155
167
  f'Invalid config YAML ({config_path}). See: '
156
- 'https://skypilot.readthedocs.io/en/latest/reference/config.html. ' # pylint: disable=line-too-long
168
+ 'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long
157
169
  'Error: ',
158
170
  skip_none=False)
159
171
 
@@ -166,17 +178,67 @@ def loaded_config_path() -> Optional[str]:
166
178
 
167
179
 
168
180
  # Load on import.
169
- _try_load_config()
170
-
171
-
172
- def _check_loaded_or_die():
173
- """Checks loaded() is true; otherwise raises RuntimeError."""
174
- if _dict is None:
175
- raise RuntimeError(
176
- f'No user configs loaded. Check {CONFIG_PATH} exists and '
177
- 'can be loaded.')
181
+ _reload_config()
178
182
 
179
183
 
180
184
  def loaded() -> bool:
181
185
  """Returns if the user configurations are loaded."""
182
- return _dict is not None
186
+ return bool(_dict)
187
+
188
+
189
+ @contextlib.contextmanager
190
+ def override_skypilot_config(
191
+ override_configs: Optional[Dict[str, Any]]) -> Iterator[None]:
192
+ """Overrides the user configurations."""
193
+ # TODO(SKY-1215): allow admin user to extend the disallowed keys or specify
194
+ # allowed keys.
195
+ if not override_configs:
196
+ # If no override configs (None or empty dict), do nothing.
197
+ yield
198
+ return
199
+ original_env_config_path = _loaded_config_path
200
+ original_config = dict(_dict)
201
+ config = _dict.get_nested(
202
+ keys=tuple(),
203
+ default_value=None,
204
+ override_configs=override_configs,
205
+ allowed_override_keys=None,
206
+ disallowed_override_keys=constants.SKIPPED_CLIENT_OVERRIDE_KEYS)
207
+ with tempfile.NamedTemporaryFile(
208
+ mode='w',
209
+ prefix='skypilot_config',
210
+ # Have to avoid deleting the file as the underlying function needs
211
+ # to read the config file, and we need to close the file mode='w'
212
+ # to enable reading.
213
+ delete=False) as f:
214
+ common_utils.dump_yaml(f.name, dict(config))
215
+ os.environ[ENV_VAR_SKYPILOT_CONFIG] = f.name
216
+ try:
217
+ _reload_config()
218
+ yield
219
+ except exceptions.InvalidSkyPilotConfigError as e:
220
+ with ux_utils.print_exception_no_traceback():
221
+ raise exceptions.InvalidSkyPilotConfigError(
222
+ 'Failed to override the SkyPilot config on API '
223
+ 'server with your local SkyPilot config:\n'
224
+ '=== SkyPilot config on API server ===\n'
225
+ f'{common_utils.dump_yaml_str(original_config)}\n'
226
+ '=== Your local SkyPilot config ===\n'
227
+ f'{common_utils.dump_yaml_str(override_configs)}\n'
228
+ f'Details: {e}') from e
229
+
230
+ finally:
231
+ if original_env_config_path is not None:
232
+ os.environ[ENV_VAR_SKYPILOT_CONFIG] = original_env_config_path
233
+ else:
234
+ os.environ.pop(ENV_VAR_SKYPILOT_CONFIG, None)
235
+ # Reload the config to restore the original config to avoid the next
236
+ # request reusing the same process to use the config for the current
237
+ # request.
238
+ _reload_config()
239
+
240
+ try:
241
+ os.remove(f.name)
242
+ except Exception: # pylint: disable=broad-except
243
+ # Failing to delete the file is not critical.
244
+ pass
sky/task.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Task: a coarse-grained stage in an application."""
2
+ import collections
2
3
  import inspect
3
4
  import json
4
5
  import os
@@ -121,6 +122,9 @@ def _check_docker_login_config(task_envs: Dict[str, str]) -> bool:
121
122
 
122
123
  If any of the docker login env vars is set, all of them must be set.
123
124
 
125
+ Returns:
126
+ True if there is a valid docker login config in task_envs.
127
+ False otherwise.
124
128
  Raises:
125
129
  ValueError: if any of the docker login env vars is set, but not all of
126
130
  them are set.
@@ -168,6 +172,23 @@ def _with_docker_login_config(
168
172
  return type(resources)(new_resources)
169
173
 
170
174
 
175
+ def _with_docker_username_for_runpod(
176
+ resources: Union[Set['resources_lib.Resources'],
177
+ List['resources_lib.Resources']],
178
+ task_envs: Dict[str, str],
179
+ ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
180
+ docker_username_for_runpod = task_envs.get(
181
+ constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
182
+
183
+ # We should not call r.copy() if docker_username_for_runpod is None,
184
+ # to prevent `DummyResources` instance becoming a `Resources` instance.
185
+ if docker_username_for_runpod is None:
186
+ return resources
187
+ return (type(resources)(
188
+ r.copy(_docker_username_for_runpod=docker_username_for_runpod)
189
+ for r in resources))
190
+
191
+
171
192
  class Task:
172
193
  """Task: a computation to be run on the cloud."""
173
194
 
@@ -184,6 +205,8 @@ class Task:
184
205
  docker_image: Optional[str] = None,
185
206
  event_callback: Optional[str] = None,
186
207
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
208
+ # Internal use only.
209
+ file_mounts_mapping: Optional[Dict[str, str]] = None,
187
210
  ):
188
211
  """Initializes a Task.
189
212
 
@@ -280,21 +303,33 @@ class Task:
280
303
 
281
304
  # Filled in by the optimizer. If None, this Task is not planned.
282
305
  self.best_resources = None
283
- # Check if the task is legal.
284
- self._validate()
306
+
307
+ # For internal use only.
308
+ self.file_mounts_mapping = file_mounts_mapping
285
309
 
286
310
  dag = sky.dag.get_current_dag()
287
311
  if dag is not None:
288
312
  dag.add(self)
289
313
 
290
- def _validate(self):
291
- """Checks if the Task fields are valid."""
314
+ def validate(self, workdir_only: bool = False):
315
+ """Validate all fields of the task."""
316
+ self.validate_name()
317
+ self.validate_run()
318
+ self.expand_and_validate_workdir()
319
+ if not workdir_only:
320
+ self.expand_and_validate_file_mounts()
321
+ for r in self.resources:
322
+ r.validate()
323
+
324
+ def validate_name(self):
325
+ """Validates if the task name is valid."""
292
326
  if not _is_valid_name(self.name):
293
327
  with ux_utils.print_exception_no_traceback():
294
328
  raise ValueError(f'Invalid task name {self.name}. Valid name: '
295
329
  f'{_VALID_NAME_DESCR}')
296
330
 
297
- # Check self.run
331
+ def validate_run(self):
332
+ """Validates if the run command is valid."""
298
333
  if callable(self.run):
299
334
  run_sig = inspect.signature(self.run)
300
335
  # Check that run is a function with 2 arguments.
@@ -333,15 +368,65 @@ class Task:
333
368
  f'a command generator ({CommandGen}). '
334
369
  f'Got {type(self.run)}')
335
370
 
336
- # Workdir.
337
- if self.workdir is not None:
338
- full_workdir = os.path.abspath(os.path.expanduser(self.workdir))
339
- if not os.path.isdir(full_workdir):
340
- # Symlink to a dir is legal (isdir() follows symlinks).
371
+ def expand_and_validate_file_mounts(self):
372
+ """Expand file_mounts paths to absolute paths and validate them.
373
+
374
+ Note: if this function is called on a remote SkyPilot API server,
375
+ it must be after the client side has sync-ed all files to the
376
+ remote server.
377
+ """
378
+ if self.file_mounts is None:
379
+ return
380
+ for target, source in self.file_mounts.items():
381
+ if target.endswith('/') or source.endswith('/'):
382
+ with ux_utils.print_exception_no_traceback():
383
+ raise ValueError(
384
+ 'File mount paths cannot end with a slash '
385
+ '(try "/mydir: /mydir" or "/myfile: /myfile"). '
386
+ f'Found: target={target} source={source}')
387
+ if data_utils.is_cloud_store_url(target):
388
+ with ux_utils.print_exception_no_traceback():
389
+ raise ValueError(
390
+ 'File mount destination paths cannot be cloud storage')
391
+ if not data_utils.is_cloud_store_url(source):
392
+ self.file_mounts[target] = os.path.abspath(
393
+ os.path.expanduser(source))
394
+ if not os.path.exists(self.file_mounts[target]
395
+ ) and not source.startswith('skypilot:'):
396
+ with ux_utils.print_exception_no_traceback():
397
+ raise ValueError(
398
+ f'File mount source {source!r} does not exist '
399
+ 'locally. To fix: check if it exists, and correct '
400
+ 'the path.')
401
+ # TODO(zhwu): /home/username/sky_workdir as the target path needs
402
+ # to be filtered out as well.
403
+ if (target == constants.SKY_REMOTE_WORKDIR and
404
+ self.workdir is not None):
341
405
  with ux_utils.print_exception_no_traceback():
342
406
  raise ValueError(
343
- 'Workdir must exist and must be a directory (or '
344
- f'a symlink to a directory). {self.workdir} not found.')
407
+ f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
408
+ 'destination path of a file mount, as it will be used '
409
+ 'by the workdir. If uploading a file/folder to the '
410
+ 'workdir is needed, please specify the full path to '
411
+ 'the file/folder.')
412
+
413
+ def expand_and_validate_workdir(self):
414
+ """Expand workdir to absolute path and validate it.
415
+
416
+ Note: if this function is called on a remote SkyPilot API server,
417
+ it must be after the client side has sync-ed all files to the
418
+ remote server.
419
+ """
420
+ if self.workdir is None:
421
+ return
422
+ user_workdir = self.workdir
423
+ self.workdir = os.path.abspath(os.path.expanduser(user_workdir))
424
+ if not os.path.isdir(self.workdir):
425
+ # Symlink to a dir is legal (isdir() follows symlinks).
426
+ with ux_utils.print_exception_no_traceback():
427
+ raise ValueError(
428
+ 'Workdir must be a valid directory (or '
429
+ f'a symlink to a directory). {user_workdir} not found.')
345
430
 
346
431
  @staticmethod
347
432
  def from_yaml_config(
@@ -393,6 +478,11 @@ class Task:
393
478
  config['service'] = _fill_in_env_vars(config['service'],
394
479
  config.get('envs', {}))
395
480
 
481
+ # Fill in any Task.envs into workdir
482
+ if config.get('workdir') is not None:
483
+ config['workdir'] = _fill_in_env_vars(config['workdir'],
484
+ config.get('envs', {}))
485
+
396
486
  task = Task(
397
487
  config.pop('name', None),
398
488
  run=config.pop('run', None),
@@ -401,6 +491,7 @@ class Task:
401
491
  num_nodes=config.pop('num_nodes', None),
402
492
  envs=config.pop('envs', None),
403
493
  event_callback=config.pop('event_callback', None),
494
+ file_mounts_mapping=config.pop('file_mounts_mapping', None),
404
495
  )
405
496
 
406
497
  # Create lists to store storage objects inlined in file_mounts.
@@ -456,8 +547,25 @@ class Task:
456
547
  task.set_outputs(outputs=outputs,
457
548
  estimated_size_gigabytes=estimated_size_gigabytes)
458
549
 
550
+ # Experimental configs.
551
+ experimnetal_configs = config.pop('experimental', None)
552
+ cluster_config_override = None
553
+ if experimnetal_configs is not None:
554
+ cluster_config_override = experimnetal_configs.pop(
555
+ 'config_overrides', None)
556
+ logger.debug('Overriding skypilot config with task-level config: '
557
+ f'{cluster_config_override}')
558
+ assert not experimnetal_configs, ('Invalid task args: '
559
+ f'{experimnetal_configs.keys()}')
560
+
459
561
  # Parse resources field.
460
- resources_config = config.pop('resources', None)
562
+ resources_config = config.pop('resources', {})
563
+ if cluster_config_override is not None:
564
+ assert resources_config.get('_cluster_config_overrides') is None, (
565
+ 'Cannot set _cluster_config_overrides in both resources and '
566
+ 'experimental.config_overrides')
567
+ resources_config[
568
+ '_cluster_config_overrides'] = cluster_config_override
461
569
  task.set_resources(sky.Resources.from_yaml_config(resources_config))
462
570
 
463
571
  service = config.pop('service', None)
@@ -560,6 +668,8 @@ class Task:
560
668
  if _check_docker_login_config(self._envs):
561
669
  self.resources = _with_docker_login_config(self.resources,
562
670
  self._envs)
671
+ self.resources = _with_docker_username_for_runpod(
672
+ self.resources, self._envs)
563
673
  return self
564
674
 
565
675
  @property
@@ -625,6 +735,9 @@ class Task:
625
735
  resources = {resources}
626
736
  # TODO(woosuk): Check if the resources are None.
627
737
  self.resources = _with_docker_login_config(resources, self.envs)
738
+ # Only have effect on RunPod.
739
+ self.resources = _with_docker_username_for_runpod(
740
+ self.resources, self.envs)
628
741
 
629
742
  # Evaluate if the task requires FUSE and set the requires_fuse flag
630
743
  for _, storage_obj in self.storage_mounts.items():
@@ -710,45 +823,7 @@ class Task:
710
823
 
711
824
  Returns:
712
825
  self: the current task, with file mounts set.
713
-
714
- Raises:
715
- ValueError: if input paths are invalid.
716
826
  """
717
- if file_mounts is None:
718
- self.file_mounts = None
719
- return self
720
- for target, source in file_mounts.items():
721
- if target.endswith('/') or source.endswith('/'):
722
- with ux_utils.print_exception_no_traceback():
723
- raise ValueError(
724
- 'File mount paths cannot end with a slash '
725
- '(try "/mydir: /mydir" or "/myfile: /myfile"). '
726
- f'Found: target={target} source={source}')
727
- if data_utils.is_cloud_store_url(target):
728
- with ux_utils.print_exception_no_traceback():
729
- raise ValueError(
730
- 'File mount destination paths cannot be cloud storage')
731
- if not data_utils.is_cloud_store_url(source):
732
- if (not os.path.exists(
733
- os.path.abspath(os.path.expanduser(source))) and
734
- not source.startswith('skypilot:')):
735
- with ux_utils.print_exception_no_traceback():
736
- raise ValueError(
737
- f'File mount source {source!r} does not exist '
738
- 'locally. To fix: check if it exists, and correct '
739
- 'the path.')
740
- # TODO(zhwu): /home/username/sky_workdir as the target path need
741
- # to be filtered out as well.
742
- if (target == constants.SKY_REMOTE_WORKDIR and
743
- self.workdir is not None):
744
- with ux_utils.print_exception_no_traceback():
745
- raise ValueError(
746
- f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
747
- 'destination path of a file mount, as it will be used '
748
- 'by the workdir. If uploading a file/folder to the '
749
- 'workdir is needed, please specify the full path to '
750
- 'the file/folder.')
751
-
752
827
  self.file_mounts = file_mounts
753
828
  return self
754
829
 
@@ -784,8 +859,8 @@ class Task:
784
859
  self.file_mounts = {}
785
860
  assert self.file_mounts is not None
786
861
  self.file_mounts.update(file_mounts)
787
- # For validation logic:
788
- return self.set_file_mounts(self.file_mounts)
862
+ self.expand_and_validate_file_mounts()
863
+ return self
789
864
 
790
865
  def set_storage_mounts(
791
866
  self,
@@ -933,18 +1008,46 @@ class Task:
933
1008
  file_mounts of the form ``{ /remote/path: {s3,gs,..}://<bucket path>
934
1009
  }``.
935
1010
  """
1011
+ # The same storage can be used multiple times, and we should construct
1012
+ # the storage with stores first, so that the storage will be created on
1013
+ # the correct cloud.
1014
+ name_to_storage = collections.defaultdict(list)
936
1015
  for storage in self.storage_mounts.values():
937
- if len(storage.stores) == 0:
938
- store_type, store_region = self._get_preferred_store()
939
- self.storage_plans[storage] = store_type
940
- storage.add_store(store_type, store_region)
941
- else:
942
- # We will download the first store that is added to remote.
943
- self.storage_plans[storage] = list(storage.stores.keys())[0]
944
-
1016
+ name_to_storage[storage.name].append(storage)
1017
+ for storages in name_to_storage.values():
1018
+ # Place the storage with most stores first, so that the storage will
1019
+ # be created on the correct cloud.
1020
+ storage_to_construct = sorted(storages,
1021
+ key=lambda x: len(x.stores),
1022
+ reverse=True)
1023
+ for storage in storage_to_construct:
1024
+ storage.construct()
1025
+ assert storage.name is not None, storage
1026
+ if not storage.stores:
1027
+ store_type, store_region = self._get_preferred_store()
1028
+ self.storage_plans[storage] = store_type
1029
+ storage.add_store(store_type, store_region)
1030
+ else:
1031
+ # We don't need to sync the storage here as if the stores
1032
+ # are not empty, it means the storage has been synced during
1033
+ # construct() above.
1034
+ # We will download the first store that is added to remote.
1035
+ assert all(store is not None
1036
+ for store in storage.stores.values()), storage
1037
+ self.storage_plans[storage] = list(storage.stores.keys())[0]
1038
+
1039
+ # The following logic converts the storage mounts with COPY mode into
1040
+ # inline file mounts with cloud URIs, so that the _execute_file_mounts()
1041
+ # in cloud_vm_ray_backend.py can correctly download from the specific
1042
+ # cloud storage on the remote cluster.
1043
+ # Note that this will cause duplicate destination paths in file_mounts,
1044
+ # and storage_mounts, which should be fine as our to_yaml_config() will
1045
+ # only dump the storage mount version, i.e. what user specified.
945
1046
  storage_mounts = self.storage_mounts
946
1047
  storage_plans = self.storage_plans
947
1048
  for mnt_path, storage in storage_mounts.items():
1049
+ assert storage.name is not None, storage
1050
+
948
1051
  if storage.mode == storage_lib.StorageMode.COPY:
949
1052
  store_type = storage_plans[storage]
950
1053
  if store_type is storage_lib.StoreType.S3:
@@ -955,6 +1058,7 @@ class Task:
955
1058
  else:
956
1059
  assert storage.name is not None, storage
957
1060
  blob_path = 's3://' + storage.name
1061
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
958
1062
  self.update_file_mounts({
959
1063
  mnt_path: blob_path,
960
1064
  })
@@ -965,6 +1069,26 @@ class Task:
965
1069
  else:
966
1070
  assert storage.name is not None, storage
967
1071
  blob_path = 'gs://' + storage.name
1072
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
1073
+ self.update_file_mounts({
1074
+ mnt_path: blob_path,
1075
+ })
1076
+ elif store_type is storage_lib.StoreType.AZURE:
1077
+ if (isinstance(storage.source, str) and
1078
+ data_utils.is_az_container_endpoint(
1079
+ storage.source)):
1080
+ blob_path = storage.source
1081
+ else:
1082
+ assert storage.name is not None, storage
1083
+ store_object = storage.stores[
1084
+ storage_lib.StoreType.AZURE]
1085
+ assert isinstance(store_object,
1086
+ storage_lib.AzureBlobStore)
1087
+ storage_account_name = store_object.storage_account_name
1088
+ blob_path = data_utils.AZURE_CONTAINER_URL.format(
1089
+ storage_account_name=storage_account_name,
1090
+ container_name=storage.name)
1091
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
968
1092
  self.update_file_mounts({
969
1093
  mnt_path: blob_path,
970
1094
  })
@@ -975,6 +1099,7 @@ class Task:
975
1099
  blob_path = storage.source
976
1100
  else:
977
1101
  blob_path = 'r2://' + storage.name
1102
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
978
1103
  self.update_file_mounts({
979
1104
  mnt_path: blob_path,
980
1105
  })
@@ -990,15 +1115,29 @@ class Task:
990
1115
  cos_region = data_utils.Rclone.get_region_from_rclone(
991
1116
  storage.name, data_utils.Rclone.RcloneClouds.IBM)
992
1117
  blob_path = f'cos://{cos_region}/{storage.name}'
1118
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
993
1119
  self.update_file_mounts({mnt_path: blob_path})
994
- elif store_type is storage_lib.StoreType.AZURE:
995
- # TODO when Azure Blob is done: sync ~/.azure
996
- raise NotImplementedError('Azure Blob not mountable yet')
1120
+ elif store_type is storage_lib.StoreType.OCI:
1121
+ if storage.source is not None and not isinstance(
1122
+ storage.source,
1123
+ list) and storage.source.startswith('oci://'):
1124
+ blob_path = storage.source
1125
+ else:
1126
+ blob_path = 'oci://' + storage.name
1127
+ self.update_file_mounts({
1128
+ mnt_path: blob_path,
1129
+ })
997
1130
  else:
998
1131
  with ux_utils.print_exception_no_traceback():
999
1132
  raise ValueError(f'Storage Type {store_type} '
1000
1133
  'does not exist!')
1001
1134
 
1135
+ # TODO: Delete from storage_mounts, now that the storage is
1136
+ # translated into file_mounts. Note: as is, this will break
1137
+ # controller_utils.
1138
+ # _maybe_translate_local_file_mounts_and_sync_up(), which still
1139
+ # needs the storage, but not the file_mounts.
1140
+
1002
1141
  def get_local_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
1003
1142
  """Returns file mounts of the form (dst=VM path, src=local path).
1004
1143
 
@@ -1093,6 +1232,8 @@ class Task:
1093
1232
  mount_path: storage.to_yaml_config()
1094
1233
  for mount_path, storage in self.storage_mounts.items()
1095
1234
  })
1235
+
1236
+ add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
1096
1237
  return config
1097
1238
 
1098
1239
  def get_required_cloud_features(