skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/common_utils.py CHANGED
@@ -5,7 +5,7 @@ import functools
5
5
  import getpass
6
6
  import hashlib
7
7
  import inspect
8
- import json
8
+ import io
9
9
  import os
10
10
  import platform
11
11
  import random
@@ -16,20 +16,21 @@ import time
16
16
  from typing import Any, Callable, Dict, List, Optional, Union
17
17
  import uuid
18
18
 
19
- import colorama
20
19
  import jinja2
21
20
  import jsonschema
21
+ import psutil
22
22
  import yaml
23
23
 
24
24
  from sky import exceptions
25
25
  from sky import sky_logging
26
26
  from sky.skylet import constants
27
+ from sky.usage import constants as usage_constants
28
+ from sky.utils import annotations
27
29
  from sky.utils import ux_utils
28
30
  from sky.utils import validator
29
31
 
30
32
  _USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
31
33
  USER_HASH_LENGTH = 8
32
- USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
33
34
 
34
35
  # We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
35
36
  # possibilities. considering the final cluster name contains the prefix as well,
@@ -38,16 +39,12 @@ CLUSTER_NAME_HASH_LENGTH = 2
38
39
 
39
40
  _COLOR_PATTERN = re.compile(r'\x1b[^m]*m')
40
41
 
41
- _PAYLOAD_PATTERN = re.compile(r'<sky-payload>(.*)</sky-payload>')
42
- _PAYLOAD_STR = '<sky-payload>{}</sky-payload>'
43
-
44
42
  _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
45
43
 
46
44
  logger = sky_logging.init_logger(__name__)
47
45
 
48
- _usage_run_id = None
49
-
50
46
 
47
+ @annotations.lru_cache(scope='request')
51
48
  def get_usage_run_id() -> str:
52
49
  """Returns a unique run id for each 'run'.
53
50
 
@@ -55,42 +52,44 @@ def get_usage_run_id() -> str:
55
52
  and has called its CLI or programmatic APIs. For example, two successive
56
53
  `sky launch` are two runs.
57
54
  """
58
- global _usage_run_id
59
- if _usage_run_id is None:
60
- _usage_run_id = str(uuid.uuid4())
61
- return _usage_run_id
55
+ usage_run_id = os.getenv(usage_constants.USAGE_RUN_ID_ENV_VAR)
56
+ if usage_run_id is not None:
57
+ return usage_run_id
58
+ return str(uuid.uuid4())
59
+
60
+
61
+ def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
62
+ if user_hash is None:
63
+ return False
64
+ try:
65
+ int(user_hash, 16)
66
+ except (TypeError, ValueError):
67
+ return False
68
+ return len(user_hash) == USER_HASH_LENGTH
69
+
70
+
71
+ def generate_user_hash() -> str:
72
+ """Generates a unique user-machine specific hash."""
73
+ hash_str = user_and_hostname_hash()
74
+ user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
75
+ if not _is_valid_user_hash(user_hash):
76
+ # A fallback in case the hash is invalid.
77
+ user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
78
+ return user_hash
62
79
 
63
80
 
64
- def get_user_hash(force_fresh_hash: bool = False) -> str:
81
+ def get_user_hash() -> str:
65
82
  """Returns a unique user-machine specific hash as a user id.
66
83
 
67
84
  We cache the user hash in a file to avoid potential user_name or
68
85
  hostname changes causing a new user hash to be generated.
69
-
70
- Args:
71
- force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the
72
- hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash
73
- to be generated. Used by `kubernetes.ssh_key_secret_field_name` to
74
- avoid controllers sharing the same ssh key field name as the
75
- local client.
76
86
  """
87
+ user_hash = os.getenv(constants.USER_ID_ENV_VAR)
88
+ if _is_valid_user_hash(user_hash):
89
+ assert user_hash is not None
90
+ return user_hash
77
91
 
78
- def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
79
- if user_hash is None:
80
- return False
81
- try:
82
- int(user_hash, 16)
83
- except (TypeError, ValueError):
84
- return False
85
- return len(user_hash) == USER_HASH_LENGTH
86
-
87
- if not force_fresh_hash:
88
- user_hash = os.getenv(constants.USER_ID_ENV_VAR)
89
- if _is_valid_user_hash(user_hash):
90
- assert user_hash is not None
91
- return user_hash
92
-
93
- if not force_fresh_hash and os.path.exists(_USER_HASH_FILE):
92
+ if os.path.exists(_USER_HASH_FILE):
94
93
  # Read from cached user hash file.
95
94
  with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
96
95
  # Remove invalid characters.
@@ -98,19 +97,10 @@ def get_user_hash(force_fresh_hash: bool = False) -> str:
98
97
  if _is_valid_user_hash(user_hash):
99
98
  return user_hash
100
99
 
101
- hash_str = user_and_hostname_hash()
102
- user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
103
- if not _is_valid_user_hash(user_hash):
104
- # A fallback in case the hash is invalid.
105
- user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
100
+ user_hash = generate_user_hash()
106
101
  os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
107
- if not force_fresh_hash:
108
- # Do not cache to file if force_fresh_hash is True since the file may
109
- # be intentionally using a different hash, e.g. we want to keep the
110
- # user_hash for usage collection the same on the jobs/serve controller
111
- # as users' local client.
112
- with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
113
- f.write(user_hash)
102
+ with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
103
+ f.write(user_hash)
114
104
  return user_hash
115
105
 
116
106
 
@@ -183,7 +173,7 @@ def make_cluster_name_on_cloud(display_name: str,
183
173
  f'on the cloud, we convert it to {cluster_name_on_cloud}.')
184
174
  user_hash = ''
185
175
  if add_user_hash:
186
- user_hash = get_user_hash()[:USER_HASH_LENGTH_IN_CLUSTER_NAME]
176
+ user_hash = get_user_hash()
187
177
  user_hash = f'-{user_hash}'
188
178
  user_hash_length = len(user_hash)
189
179
 
@@ -233,7 +223,7 @@ class Backoff:
233
223
  MULTIPLIER = 1.6
234
224
  JITTER = 0.4
235
225
 
236
- def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5):
226
+ def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
237
227
  self._initial = True
238
228
  self._backoff = 0.0
239
229
  self._initial_backoff = initial_backoff
@@ -255,7 +245,62 @@ class Backoff:
255
245
  return self._backoff
256
246
 
257
247
 
258
- def get_pretty_entry_point() -> str:
248
+ _current_command: Optional[str] = None
249
+ _current_client_entrypoint: Optional[str] = None
250
+ _using_remote_api_server: Optional[bool] = None
251
+
252
+
253
+ def set_client_status(client_entrypoint: Optional[str],
254
+ client_command: Optional[str],
255
+ using_remote_api_server: bool):
256
+ """Override the current client entrypoint and command.
257
+
258
+ This is useful when we are on the SkyPilot API server side and we have a
259
+ client entrypoint and command from the client.
260
+ """
261
+ global _current_command
262
+ global _current_client_entrypoint
263
+ global _using_remote_api_server
264
+ _current_command = client_command
265
+ _current_client_entrypoint = client_entrypoint
266
+ _using_remote_api_server = using_remote_api_server
267
+
268
+
269
+ def get_current_command() -> str:
270
+ """Returns the command related to this operation.
271
+
272
+ Normally uses get_pretty_entry_point(), but will use the client command on
273
+ the server side.
274
+ """
275
+ if _current_command is not None:
276
+ return _current_command
277
+
278
+ return get_pretty_entrypoint_cmd()
279
+
280
+
281
+ def get_current_client_entrypoint(server_entrypoint: str) -> str:
282
+ """Returns the current client entrypoint.
283
+
284
+ Gets the client entrypoint from the context, if it is not set, returns the
285
+ server entrypoint.
286
+ """
287
+ if _current_client_entrypoint is not None:
288
+ return _current_client_entrypoint
289
+ return server_entrypoint
290
+
291
+
292
+ def get_using_remote_api_server() -> bool:
293
+ """Returns whether the API server is remote."""
294
+ if _using_remote_api_server is not None:
295
+ return _using_remote_api_server
296
+ # This gets the right status for the local client.
297
+ # TODO(zhwu): This is to prevent circular import. We should refactor this.
298
+ # pylint: disable=import-outside-toplevel
299
+ from sky.server import common as server_common
300
+ return not server_common.is_api_server_local()
301
+
302
+
303
+ def get_pretty_entrypoint_cmd() -> str:
259
304
  """Returns the prettified entry point of this process (sys.argv).
260
305
 
261
306
  Example return values:
@@ -300,28 +345,51 @@ def user_and_hostname_hash() -> str:
300
345
  return f'{getpass.getuser()}-{hostname_hash}'
301
346
 
302
347
 
303
- def read_yaml(path) -> Dict[str, Any]:
348
+ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
349
+ if path is None:
350
+ raise ValueError('Attempted to read a None YAML.')
304
351
  with open(path, 'r', encoding='utf-8') as f:
305
352
  config = yaml.safe_load(f)
306
353
  return config
307
354
 
308
355
 
356
+ def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
357
+ stream = io.StringIO(yaml_str)
358
+ config = yaml.safe_load_all(stream)
359
+ configs = list(config)
360
+ if not configs:
361
+ # Empty YAML file.
362
+ return [{}]
363
+ return configs
364
+
365
+
309
366
  def read_yaml_all(path: str) -> List[Dict[str, Any]]:
310
367
  with open(path, 'r', encoding='utf-8') as f:
311
- config = yaml.safe_load_all(f)
312
- configs = list(config)
313
- if not configs:
314
- # Empty YAML file.
315
- return [{}]
316
- return configs
368
+ return read_yaml_all_str(f.read())
369
+
317
370
 
371
+ def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
372
+ Dict[str, Any]]) -> None:
373
+ """Dumps a YAML file.
318
374
 
319
- def dump_yaml(path, config) -> None:
375
+ Args:
376
+ path: the path to the YAML file.
377
+ config: the configuration to dump.
378
+ """
320
379
  with open(path, 'w', encoding='utf-8') as f:
321
380
  f.write(dump_yaml_str(config))
322
381
 
323
382
 
324
- def dump_yaml_str(config):
383
+ def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
384
+ """Dumps a YAML string.
385
+
386
+ Args:
387
+ config: the configuration to dump.
388
+
389
+ Returns:
390
+ The YAML string.
391
+ """
392
+
325
393
  # https://github.com/yaml/pyyaml/issues/127
326
394
  class LineBreakDumper(yaml.SafeDumper):
327
395
 
@@ -331,9 +399,9 @@ def dump_yaml_str(config):
331
399
  super().write_line_break()
332
400
 
333
401
  if isinstance(config, list):
334
- dump_func = yaml.dump_all
402
+ dump_func = yaml.dump_all # type: ignore
335
403
  else:
336
- dump_func = yaml.dump
404
+ dump_func = yaml.dump # type: ignore
337
405
  return dump_func(config,
338
406
  Dumper=LineBreakDumper,
339
407
  sort_keys=False,
@@ -362,7 +430,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
362
430
 
363
431
  @functools.wraps(f)
364
432
  def _record(*args, **kwargs):
365
- nonlocal name_or_fn
366
433
  with cls(name_or_fn, **ctx_kwargs):
367
434
  return f(*args, **kwargs)
368
435
 
@@ -376,7 +443,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
376
443
 
377
444
  @functools.wraps(name_or_fn)
378
445
  def _record(*args, **kwargs):
379
- nonlocal name_or_fn
380
446
  f = name_or_fn
381
447
  func_name = getattr(f, '__qualname__', f.__name__)
382
448
  module_name = getattr(f, '__module__', '')
@@ -411,43 +477,6 @@ def retry(method, max_retries=3, initial_backoff=1):
411
477
  return method_with_retries
412
478
 
413
479
 
414
- def encode_payload(payload: Any) -> str:
415
- """Encode a payload to make it more robust for parsing.
416
-
417
- This makes message transfer more robust to any additional strings added to
418
- the message during transfer.
419
-
420
- An example message that is polluted by the system warning:
421
- "LC_ALL: cannot change locale (en_US.UTF-8)\n<sky-payload>hello, world</sky-payload>" # pylint: disable=line-too-long
422
-
423
- Args:
424
- payload: A str, dict or list to be encoded.
425
-
426
- Returns:
427
- A string that is encoded from the payload.
428
- """
429
- payload_str = json.dumps(payload)
430
- payload_str = _PAYLOAD_STR.format(payload_str)
431
- return payload_str
432
-
433
-
434
- def decode_payload(payload_str: str) -> Any:
435
- """Decode a payload string.
436
-
437
- Args:
438
- payload_str: A string that is encoded from a payload.
439
-
440
- Returns:
441
- A str, dict or list that is decoded from the payload string.
442
- """
443
- matched = _PAYLOAD_PATTERN.findall(payload_str)
444
- if not matched:
445
- raise ValueError(f'Invalid payload string: \n{payload_str}')
446
- payload_str = matched[0]
447
- payload = json.loads(payload_str)
448
- return payload
449
-
450
-
451
480
  def class_fullname(cls, skip_builtins: bool = True):
452
481
  """Get the full name of a class.
453
482
 
@@ -478,11 +507,9 @@ def format_exception(e: Union[Exception, SystemExit, KeyboardInterrupt],
478
507
  Returns:
479
508
  A string that represents the exception.
480
509
  """
481
- bright = colorama.Style.BRIGHT
482
- reset = colorama.Style.RESET_ALL
483
510
  if use_bracket:
484
- return f'{bright}[{class_fullname(e.__class__)}]{reset} {e}'
485
- return f'{bright}{class_fullname(e.__class__)}:{reset} {e}'
511
+ return f'[{class_fullname(e.__class__)}] {e}'
512
+ return f'{class_fullname(e.__class__)}: {e}'
486
513
 
487
514
 
488
515
  def remove_color(s: str):
@@ -497,12 +524,14 @@ def remove_color(s: str):
497
524
  return _COLOR_PATTERN.sub('', s)
498
525
 
499
526
 
500
- def remove_file_if_exists(path: str):
527
+ def remove_file_if_exists(path: Optional[str]):
501
528
  """Delete a file if it exists.
502
529
 
503
530
  Args:
504
531
  path: The path to the file.
505
532
  """
533
+ if path is None:
534
+ return
506
535
  try:
507
536
  os.remove(path)
508
537
  except FileNotFoundError:
@@ -581,7 +610,10 @@ def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
581
610
  e.message)
582
611
  else:
583
612
  err_msg = err_msg_prefix
613
+ assert isinstance(e.schema, dict), 'Schema must be a dictionary'
584
614
  known_fields = set(e.schema.get('properties', {}).keys())
615
+ assert isinstance(e.instance,
616
+ dict), 'Instance must be a dictionary'
585
617
  for field in e.instance:
586
618
  if field not in known_fields:
587
619
  most_similar_field = difflib.get_close_matches(
@@ -602,7 +634,7 @@ def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
602
634
 
603
635
  if err_msg:
604
636
  with ux_utils.print_exception_no_traceback():
605
- raise ValueError(err_msg)
637
+ raise exceptions.InvalidSkyPilotConfigError(err_msg)
606
638
 
607
639
 
608
640
  def get_cleaned_username(username: str = '') -> str:
@@ -634,7 +666,7 @@ def get_cleaned_username(username: str = '') -> str:
634
666
  return username
635
667
 
636
668
 
637
- def fill_template(template_name: str, variables: Dict,
669
+ def fill_template(template_name: str, variables: Dict[str, Any],
638
670
  output_path: str) -> None:
639
671
  """Create a file from a Jinja template and return the filename."""
640
672
  assert template_name.endswith('.j2'), template_name
@@ -678,3 +710,182 @@ def deprecated_function(
678
710
  return func(*args, **kwargs)
679
711
 
680
712
  return new_func
713
+
714
+
715
+ def truncate_long_string(s: str, max_length: int = 35) -> str:
716
+ """Truncate a string to a maximum length, preserving whole words."""
717
+ if len(s) <= max_length:
718
+ return s
719
+ splits = s.split(' ')
720
+ if len(splits[0]) > max_length:
721
+ return splits[0][:max_length] + '...' # Use '…'?
722
+ # Truncate on word boundary.
723
+ i = 0
724
+ total = 0
725
+ for i, part in enumerate(splits):
726
+ total += len(part)
727
+ if total >= max_length:
728
+ break
729
+ prefix = ' '.join(splits[:i])
730
+ if len(prefix) < max_length:
731
+ prefix += s[len(prefix):max_length]
732
+ return prefix + '...'
733
+
734
+
735
+ def hash_file(path: str, hash_alg: str) -> 'hashlib._Hash':
736
+ # In python 3.11, hashlib.file_digest is available, but for <3.11 we have to
737
+ # do it manually.
738
+ # This implementation is simplified from the implementation in CPython.
739
+ # TODO(cooperc): Use hashlib.file_digest once we move to 3.11+.
740
+ # Beware of f.read() as some files may be larger than memory.
741
+ with open(path, 'rb') as f:
742
+ file_hash = hashlib.new(hash_alg)
743
+ buf = bytearray(2**18)
744
+ view = memoryview(buf)
745
+ while True:
746
+ size = f.readinto(buf)
747
+ if size == 0:
748
+ # EOF
749
+ break
750
+ file_hash.update(view[:size])
751
+ return file_hash
752
+
753
+
754
+ def is_port_available(port: int, reuse_addr: bool = True) -> bool:
755
+ """Check if a TCP port is available for binding on localhost.
756
+
757
+ Args:
758
+ port: The port number to check.
759
+ reuse_addr: If True, sets SO_REUSEADDR socket option to allow reusing
760
+ ports in TIME_WAIT state. Servers like multiprocessing.Manager set
761
+ SO_REUSEADDR by default to accelerate restart. The option should be
762
+ coordinated in check.
763
+
764
+ Returns:
765
+ bool: True if the port is available for binding, False otherwise.
766
+ """
767
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
768
+ if reuse_addr:
769
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
770
+ try:
771
+ s.bind(('localhost', port))
772
+ return True
773
+ except OSError:
774
+ return False
775
+
776
+
777
+ def get_cpu_count() -> int:
778
+ """Get the number of CPUs, with cgroup awareness."""
779
+ # This env-var is kept since it is still useful for limiting the resource
780
+ # of SkyPilot in non-containerized environments.
781
+ cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
782
+ if cpu_count is not None:
783
+ try:
784
+ return int(float(cpu_count))
785
+ except ValueError as e:
786
+ with ux_utils.print_exception_no_traceback():
787
+ raise ValueError(
788
+ f'Failed to parse the number of CPUs from {cpu_count}'
789
+ ) from e
790
+ return _cpu_count()
791
+
792
+
793
+ def get_mem_size_gb() -> float:
794
+ """Get the memory size in GB, with cgroup awareness."""
795
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
796
+ if mem_size is not None:
797
+ try:
798
+ return float(mem_size)
799
+ except ValueError as e:
800
+ with ux_utils.print_exception_no_traceback():
801
+ raise ValueError(
802
+ f'Failed to parse the memory size from {mem_size}') from e
803
+ return _mem_size_gb()
804
+
805
+
806
+ def _cpu_count() -> int:
807
+ # host cpu cores (logical)
808
+ cpu = psutil.cpu_count()
809
+ # cpu affinity on Linux
810
+ if hasattr(os, 'sched_getaffinity'):
811
+ # just for safe, length of CPU set should always <= logical cpu cores
812
+ cpu = min(cpu, len(os.sched_getaffinity(0)))
813
+ cgroup_cpu = _get_cgroup_cpu_limit()
814
+ if cgroup_cpu is not None:
815
+ cpu = min(cpu, int(cgroup_cpu))
816
+ return cpu
817
+
818
+
819
+ def _mem_size_gb() -> float:
820
+ # host memory limit
821
+ mem = psutil.virtual_memory().total
822
+ cgroup_mem = _get_cgroup_memory_limit()
823
+ if cgroup_mem is not None:
824
+ mem = min(mem, cgroup_mem)
825
+ return mem / (1024**3)
826
+
827
+
828
+ # Refer to:
829
+ # - https://docs.kernel.org/admin-guide/cgroup-v1/index.html
830
+ # - https://docs.kernel.org/admin-guide/cgroup-v2.html
831
+ # for the standards of handler files in cgroupv1 and v2.
832
+ # Since all those paths are well-known standards that are unlikely to change,
833
+ # we use string literals instead of defining extra constants.
834
+ def _get_cgroup_cpu_limit() -> Optional[float]:
835
+ """Return cpu limit from cgroups in cores.
836
+
837
+ Returns:
838
+ The cpu limit in cores as a float (can be fractional), or None if there
839
+ is no limit in cgroups.
840
+ """
841
+ try:
842
+ if _is_cgroup_v2():
843
+ with open('/sys/fs/cgroup/cpu.max', 'r', encoding='utf-8') as f:
844
+ quota_str, period_str = f.read().strip().split()
845
+ if quota_str == 'max':
846
+ return None
847
+ quota = float(quota_str)
848
+ period = float(period_str)
849
+ return quota / period if quota > 0 else None
850
+ else:
851
+ # cgroup v1
852
+ with open('/sys/fs/cgroup/cpu/cpu.cfs_quota_us',
853
+ 'r',
854
+ encoding='utf-8') as f:
855
+ quota = float(f.read().strip())
856
+ with open('/sys/fs/cgroup/cpu/cpu.cfs_period_us',
857
+ 'r',
858
+ encoding='utf-8') as f:
859
+ period = float(f.read().strip())
860
+ # Return unlimited if cpu quota is not set.
861
+ # Note that we do not use cpu.shares since it is a relative weight
862
+ # instead of a hard limit. It is okay to get CPU throttling under
863
+ # high contention. And unlimited enables the server to use as much
864
+ # CPU as available if there is no contention.
865
+ return quota / period if (quota > 0 and period > 0) else None
866
+ except (OSError, ValueError):
867
+ return None
868
+
869
+
870
+ def _get_cgroup_memory_limit() -> Optional[int]:
871
+ """Return memory limit from cgroups in bytes.
872
+
873
+ Returns:
874
+ The memory limit in bytes, or None if there is no limit in cgroups.
875
+ """
876
+ try:
877
+ path = ('/sys/fs/cgroup/memory.max' if _is_cgroup_v2() else
878
+ '/sys/fs/cgroup/memory/memory.limit_in_bytes')
879
+ with open(path, 'r', encoding='utf-8') as f:
880
+ value = f.read().strip()
881
+ if value == 'max' or not value:
882
+ return None
883
+ limit = int(value)
884
+ return limit if limit > 0 else None
885
+ except (OSError, ValueError):
886
+ return None
887
+
888
+
889
+ def _is_cgroup_v2() -> bool:
890
+ """Return True if the environment is running cgroup v2."""
891
+ return os.path.isfile('/sys/fs/cgroup/cgroup.controllers')