skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -2,21 +2,26 @@
2
2
  from multiprocessing import pool
3
3
  import os
4
4
  import random
5
+ import resource
6
+ import shlex
5
7
  import subprocess
6
8
  import time
7
- from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
9
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
10
 
9
11
  import colorama
10
12
  import psutil
11
13
 
12
14
  from sky import exceptions
13
15
  from sky import sky_logging
16
+ from sky.skylet import constants
14
17
  from sky.skylet import log_lib
15
18
  from sky.utils import timeline
16
19
  from sky.utils import ux_utils
17
20
 
18
21
  logger = sky_logging.init_logger(__name__)
19
22
 
23
+ _fd_limit_warning_shown = False
24
+
20
25
 
21
26
  @timeline.event
22
27
  def run(cmd, **kwargs):
@@ -42,27 +47,86 @@ def run_no_outputs(cmd, **kwargs):
42
47
  **kwargs)
43
48
 
44
49
 
45
- def get_parallel_threads() -> int:
46
- """Returns the number of idle CPUs."""
50
+ def _get_thread_multiplier(cloud_str: Optional[str] = None) -> int:
51
+ # If using Kubernetes, we use 4x the number of cores.
52
+ if cloud_str and cloud_str.lower() == 'kubernetes':
53
+ return 4
54
+ return 1
55
+
56
+
57
+ def get_max_workers_for_file_mounts(common_file_mounts: Dict[str, str],
58
+ cloud_str: Optional[str] = None) -> int:
59
+ global _fd_limit_warning_shown
60
+ fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
61
+
62
+ # Raise warning for low fd_limit (only once)
63
+ if fd_limit < 1024 and not _fd_limit_warning_shown:
64
+ logger.warning(
65
+ f'Open file descriptor limit ({fd_limit}) is low. File sync to '
66
+ 'remote clusters may be slow. Consider increasing the limit using '
67
+ '`ulimit -n <number>` or modifying system limits.')
68
+ _fd_limit_warning_shown = True
69
+
70
+ fd_per_rsync = 5
71
+ for src in common_file_mounts.values():
72
+ if os.path.isdir(src):
73
+ # Assume that each file/folder under src takes 5 file descriptors
74
+ # on average.
75
+ fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)
76
+
77
+ # Reserve some file descriptors for the system and other processes
78
+ fd_reserve = 100
79
+
80
+ max_workers = (fd_limit - fd_reserve) // fd_per_rsync
81
+ # At least 1 worker, and avoid too many workers overloading the system.
82
+ num_threads = get_parallel_threads(cloud_str)
83
+ max_workers = min(max(max_workers, 1), num_threads)
84
+ logger.debug(f'Using {max_workers} workers for file mounts.')
85
+ return max_workers
86
+
87
+
88
+ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
89
+ """Returns the number of threads to use for parallel execution.
90
+
91
+ Args:
92
+ cloud_str: The cloud
93
+ """
47
94
  cpu_count = os.cpu_count()
48
95
  if cpu_count is None:
49
96
  cpu_count = 1
50
- return max(4, cpu_count - 1)
97
+ return max(4, cpu_count - 1) * _get_thread_multiplier(cloud_str)
51
98
 
52
99
 
53
- def run_in_parallel(func: Callable, args: Iterable[Any]) -> List[Any]:
100
+ def run_in_parallel(func: Callable,
101
+ args: List[Any],
102
+ num_threads: Optional[int] = None) -> List[Any]:
54
103
  """Run a function in parallel on a list of arguments.
55
104
 
56
- The function 'func' should raise a CommandError if the command fails.
105
+ Args:
106
+ func: The function to run in parallel
107
+ args: Iterable of arguments to pass to func
108
+ num_threads: Number of threads to use. If None, uses
109
+ get_parallel_threads()
57
110
 
58
111
  Returns:
59
112
  A list of the return values of the function func, in the same order as the
60
- arguments.
113
+ arguments.
114
+
115
+ Raises:
116
+ Exception: The first exception encountered.
61
117
  """
62
- # Reference: https://stackoverflow.com/questions/25790279/python-multiprocessing-early-termination # pylint: disable=line-too-long
63
- with pool.ThreadPool(processes=get_parallel_threads()) as p:
64
- # Run the function in parallel on the arguments, keeping the order.
65
- return list(p.imap(func, args))
118
+ # Short-circuit for short lists
119
+ if len(args) == 0:
120
+ return []
121
+ if len(args) == 1:
122
+ return [func(args[0])]
123
+
124
+ processes = (num_threads
125
+ if num_threads is not None else get_parallel_threads())
126
+
127
+ with pool.ThreadPool(processes=processes) as p:
128
+ ordered_iterators = p.imap(func, args)
129
+ return list(ordered_iterators)
66
130
 
67
131
 
68
132
  def handle_returncode(returncode: int,
@@ -77,8 +141,9 @@ def handle_returncode(returncode: int,
77
141
  command: The command that was run.
78
142
  error_msg: The error message to print.
79
143
  stderr: The stderr of the command.
144
+ stream_logs: Whether to stream logs.
80
145
  """
81
- echo = logger.error if stream_logs else lambda _: None
146
+ echo = logger.error if stream_logs else logger.debug
82
147
  if returncode != 0:
83
148
  if stderr is not None:
84
149
  echo(stderr)
@@ -92,9 +157,9 @@ def handle_returncode(returncode: int,
92
157
  stderr)
93
158
 
94
159
 
95
- def kill_children_processes(
96
- first_pid_to_kill: Optional[Union[int, List[Optional[int]]]] = None,
97
- force: bool = False):
160
+ def kill_children_processes(parent_pids: Optional[Union[
161
+ int, List[Optional[int]]]] = None,
162
+ force: bool = False) -> None:
98
163
  """Kill children processes recursively.
99
164
 
100
165
  We need to kill the children, so that
@@ -104,41 +169,57 @@ def kill_children_processes(
104
169
  etc. while we are cleaning up the clusters.
105
170
 
106
171
  Args:
107
- first_pid_to_kill: Optional PID of a process, or PIDs of a series of
108
- processes to be killed first. If a list of PID is specified, it is
109
- killed by the order in the list.
110
- This is for guaranteeing the order of cleaning up and suppress
111
- flaky errors.
172
+ parent_pids: Optional PIDs of a series of processes. The processes and
173
+ their children will be killed. If a list of PID is specified, it is
174
+ killed by the order in the list. This is for guaranteeing the order
175
+ of cleaning up and suppress flaky errors.
176
+ force: bool, send SIGKILL if force, otherwise, use SIGTERM for
177
+ gracefully kill the process.
112
178
  """
113
- pid_to_proc = dict()
114
- child_processes = []
115
- if isinstance(first_pid_to_kill, int):
116
- first_pid_to_kill = [first_pid_to_kill]
117
- elif first_pid_to_kill is None:
118
- first_pid_to_kill = []
119
-
120
- def _kill_processes(processes: List[psutil.Process]) -> None:
121
- for process in processes:
179
+ if isinstance(parent_pids, int):
180
+ parent_pids = [parent_pids]
181
+
182
+ def kill(proc: psutil.Process):
183
+ if not proc.is_running():
184
+ # Skip if the process is not running.
185
+ return
186
+ logger.debug(f'Killing process {proc.pid}')
187
+ try:
188
+ if force:
189
+ proc.kill()
190
+ else:
191
+ proc.terminate()
192
+ proc.wait(timeout=10)
193
+ except psutil.NoSuchProcess:
194
+ # The child process may have already been terminated.
195
+ pass
196
+ except psutil.TimeoutExpired:
197
+ logger.debug(
198
+ f'Process {proc.pid} did not terminate after 10 seconds')
199
+ # Attempt to force kill if the normal termination fails
200
+ if not force:
201
+ logger.debug(f'Force killing process {proc.pid}')
202
+ proc.kill()
203
+ proc.wait(timeout=5) # Shorter timeout after force kill
204
+
205
+ parent_processes = []
206
+ if parent_pids is None:
207
+ parent_processes = [psutil.Process()]
208
+ else:
209
+ for pid in parent_pids:
122
210
  try:
123
- if force:
124
- process.kill()
125
- else:
126
- process.terminate()
211
+ process = psutil.Process(pid)
127
212
  except psutil.NoSuchProcess:
128
- # The process may have already been terminated.
129
- pass
130
-
131
- parent_process = psutil.Process()
132
- for child in parent_process.children(recursive=True):
133
- if child.pid in first_pid_to_kill:
134
- pid_to_proc[child.pid] = child
135
- else:
136
- child_processes.append(child)
213
+ continue
214
+ parent_processes.append(process)
137
215
 
138
- _kill_processes([
139
- pid_to_proc[proc] for proc in first_pid_to_kill if proc in pid_to_proc
140
- ])
141
- _kill_processes(child_processes)
216
+ for parent_process in parent_processes:
217
+ child_processes = parent_process.children(recursive=True)
218
+ if parent_pids is not None:
219
+ kill(parent_process)
220
+ logger.debug(f'Killing child processes: {child_processes}')
221
+ for child in child_processes:
222
+ kill(child)
142
223
 
143
224
 
144
225
  def run_with_retries(
@@ -187,3 +268,88 @@ def run_with_retries(
187
268
  continue
188
269
  break
189
270
  return returncode, stdout, stderr
271
+
272
+
273
+ def kill_process_daemon(process_pid: int) -> None:
274
+ """Start a daemon as a safety net to kill the process.
275
+
276
+ Args:
277
+ process_pid: The PID of the process to kill.
278
+ """
279
+ # Get initial children list
280
+ try:
281
+ process = psutil.Process(process_pid)
282
+ initial_children = [p.pid for p in process.children(recursive=True)]
283
+ except psutil.NoSuchProcess:
284
+ initial_children = []
285
+
286
+ parent_pid = os.getpid()
287
+ daemon_script = os.path.join(
288
+ os.path.dirname(os.path.abspath(log_lib.__file__)),
289
+ 'subprocess_daemon.py')
290
+ python_path = subprocess.check_output(constants.SKY_GET_PYTHON_PATH_CMD,
291
+ shell=True,
292
+ stderr=subprocess.DEVNULL,
293
+ encoding='utf-8').strip()
294
+ daemon_cmd = [
295
+ python_path,
296
+ daemon_script,
297
+ '--parent-pid',
298
+ str(parent_pid),
299
+ '--proc-pid',
300
+ str(process_pid),
301
+ # We pass the initial children list to avoid the race condition where
302
+ # the process_pid is terminated before the daemon starts and gets the
303
+ # children list.
304
+ '--initial-children',
305
+ ','.join(map(str, initial_children)),
306
+ ]
307
+
308
+ # We do not need to set `start_new_session=True` here, as the
309
+ # daemon script will detach itself from the parent process with
310
+ # fork to avoid being killed by parent process. See the reason we
311
+ # daemonize the process in `sky/skylet/subprocess_daemon.py`.
312
+ subprocess.Popen(
313
+ daemon_cmd,
314
+ # Suppress output
315
+ stdout=subprocess.DEVNULL,
316
+ stderr=subprocess.DEVNULL,
317
+ # Disable input
318
+ stdin=subprocess.DEVNULL,
319
+ )
320
+
321
+
322
+ def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
323
+ """Launch a new process that will not be a child of the current process.
324
+
325
+ This will launch bash in a new session, which will launch the given cmd.
326
+ This will ensure that cmd is in its own process tree, and once bash exits,
327
+ will not be an ancestor of the current process. This is useful for job
328
+ launching.
329
+
330
+ Returns the pid of the launched cmd.
331
+ """
332
+ # Use nohup to ensure the job driver process is a separate process tree,
333
+ # instead of being a child of the current process. This is important to
334
+ # avoid a chain of driver processes (job driver can call schedule_step() to
335
+ # submit new jobs, and the new job can also call schedule_step()
336
+ # recursively).
337
+ #
338
+ # echo $! will output the PID of the last background process started in the
339
+ # current shell, so we can retrieve it and record in the DB.
340
+ #
341
+ # TODO(zhwu): A more elegant solution is to use another daemon process to be
342
+ # in charge of starting these driver processes, instead of starting them in
343
+ # the current process.
344
+ wrapped_cmd = (f'nohup bash -c {shlex.quote(cmd)} '
345
+ f'</dev/null >{log_output} 2>&1 & echo $!')
346
+ proc = subprocess.run(wrapped_cmd,
347
+ stdout=subprocess.PIPE,
348
+ stderr=subprocess.PIPE,
349
+ stdin=subprocess.DEVNULL,
350
+ start_new_session=True,
351
+ check=True,
352
+ shell=True,
353
+ text=True)
354
+ # Get the PID of the detached process
355
+ return int(proc.stdout.strip())
sky/utils/timeline.py CHANGED
@@ -9,6 +9,7 @@ import json
9
9
  import os
10
10
  import threading
11
11
  import time
12
+ import traceback
12
13
  from typing import Callable, Optional, Union
13
14
 
14
15
  import filelock
@@ -48,8 +49,9 @@ class Event:
48
49
  'ph': 'B',
49
50
  'ts': f'{time.time() * 10 ** 6: .3f}',
50
51
  })
52
+ event_begin['args'] = {'stack': '\n'.join(traceback.format_stack())}
51
53
  if self._message is not None:
52
- event_begin['args'] = {'message': self._message}
54
+ event_begin['args']['message'] = self._message
53
55
  _events.append(event_begin)
54
56
 
55
57
  def end(self):
@@ -77,11 +79,11 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
77
79
  class FileLockEvent:
78
80
  """Serve both as a file lock and event for the lock."""
79
81
 
80
- def __init__(self, lockfile: Union[str, os.PathLike]):
82
+ def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
81
83
  self._lockfile = lockfile
82
- # TODO(mraheja): remove pylint disabling when filelock version updated
83
- # pylint: disable=abstract-class-instantiated
84
- self._lock = filelock.FileLock(self._lockfile)
84
+ os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
85
+ exist_ok=True)
86
+ self._lock = filelock.FileLock(self._lockfile, timeout)
85
87
  self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
86
88
 
87
89
  def acquire(self):
@@ -116,7 +118,10 @@ class FileLockEvent:
116
118
  return wrapper
117
119
 
118
120
 
119
- def _save_timeline(file_path: str):
121
+ def save_timeline():
122
+ file_path = os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
123
+ if not file_path:
124
+ return
120
125
  json_output = {
121
126
  'traceEvents': _events,
122
127
  'displayTimeUnit': 'ms',
@@ -130,4 +135,4 @@ def _save_timeline(file_path: str):
130
135
 
131
136
 
132
137
  if os.environ.get('SKYPILOT_TIMELINE_FILE_PATH'):
133
- atexit.register(_save_timeline, os.environ['SKYPILOT_TIMELINE_FILE_PATH'])
138
+ atexit.register(save_timeline)
sky/utils/ux_utils.py CHANGED
@@ -1,21 +1,42 @@
1
1
  """Utility functions for UX."""
2
2
  import contextlib
3
+ import enum
4
+ import os
3
5
  import sys
4
6
  import traceback
5
- from typing import Callable
7
+ import typing
8
+ from typing import Callable, Optional, Union
6
9
 
10
+ import colorama
7
11
  import rich.console as rich_console
8
12
 
9
13
  from sky import sky_logging
14
+ from sky.skylet import constants
10
15
  from sky.utils import common_utils
11
- from sky.utils import env_options
12
- from sky.utils import ux_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ import pathlib
13
19
 
14
20
  console = rich_console.Console()
15
21
 
22
+ INDENT_SYMBOL = f'{colorama.Style.DIM}├── {colorama.Style.RESET_ALL}'
23
+ INDENT_LAST_SYMBOL = f'{colorama.Style.DIM}└── {colorama.Style.RESET_ALL}'
24
+
25
+ # Console formatting constants
26
+ BOLD = '\033[1m'
27
+ RESET_BOLD = '\033[0m'
28
+
29
+ # Log path hint in the spinner during launching
30
+ _LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
31
+ '{log_path}'
32
+ f'{colorama.Style.RESET_ALL}')
33
+ _LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
34
+ '{log_path}'
35
+ f'{colorama.Style.RESET_ALL}')
36
+
16
37
 
17
38
  def console_newline():
18
- """Print a newline to the console using rich.
39
+ """Prints a newline to the console using rich.
19
40
 
20
41
  Useful when catching exceptions inside console.status()
21
42
  """
@@ -38,19 +59,15 @@ def print_exception_no_traceback():
38
59
  if error():
39
60
  raise ValueError('...')
40
61
  """
41
- if env_options.Options.SHOW_DEBUG_INFO.get():
42
- # When SKYPILOT_DEBUG is set, show the full traceback
43
- yield
44
- else:
45
- original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
46
- sys.tracebacklimit = 0
47
- yield
48
- sys.tracebacklimit = original_tracelimit
62
+ original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
63
+ sys.tracebacklimit = 0
64
+ yield
65
+ sys.tracebacklimit = original_tracelimit
49
66
 
50
67
 
51
68
  @contextlib.contextmanager
52
69
  def enable_traceback():
53
- """Revert the effect of print_exception_no_traceback().
70
+ """Reverts the effect of print_exception_no_traceback().
54
71
 
55
72
  This is used for usage_lib to collect the full traceback.
56
73
  """
@@ -61,7 +78,7 @@ def enable_traceback():
61
78
 
62
79
 
63
80
  class RedirectOutputForProcess:
64
- """Redirect stdout and stderr to a file.
81
+ """Redirects stdout and stderr to a file.
65
82
 
66
83
  This class enabled output redirect for multiprocessing.Process.
67
84
  Example usage:
@@ -99,6 +116,142 @@ class RedirectOutputForProcess:
99
116
  except Exception as e: # pylint: disable=broad-except
100
117
  logger.error(f'Failed to run {self.func.__name__}. '
101
118
  f'Details: {common_utils.format_exception(e)}')
102
- with ux_utils.enable_traceback():
119
+ with enable_traceback():
103
120
  logger.error(f' Traceback:\n{traceback.format_exc()}')
104
121
  raise
122
+
123
+
124
def log_path_hint(log_path: Union[str, 'pathlib.Path'],
                  is_local: bool = False) -> str:
    """Gets the log path hint for the given log path.

    The user's home directory at the start of the path is abbreviated to
    `~`. For a local path the abbreviated path is shown as is. Otherwise the
    `constants.SKY_LOGS_DIRECTORY` prefix and any leading path separators are
    dropped, and the remainder is shown as the argument of `sky api logs -l`.

    Directory prefixes are matched on whole path components only, so a
    sibling directory that merely shares a textual prefix (e.g. `/home/user2`
    when the home directory is `/home/user`) is left untouched.

    Args:
        log_path: The log path to build the hint for.
        is_local: If True, format the hint with the local template, which
            shows the path itself rather than a `sky api logs` command.

    Returns:
        The dimmed `View logs: ...` hint string.
    """
    sep = os.path.sep
    log_path = str(log_path)

    # Strip the trailing separator so that `home + sep` is always exactly one
    # component boundary; a root home directory ('/') becomes '' and
    # '/x' is then correctly rewritten to '~/x'.
    home = os.path.expanduser('~').rstrip(sep)
    if home and log_path == home:
        log_path = '~'
    elif log_path.startswith(home + sep):
        log_path = '~' + log_path[len(home):]

    if is_local:
        return _LOG_PATH_HINT_LOCAL.format(log_path=log_path)

    # Same component-boundary rule for the logs directory prefix.
    logs_dir = constants.SKY_LOGS_DIRECTORY.rstrip(sep)
    if log_path == logs_dir or log_path.startswith(logs_dir + sep):
        log_path = log_path[len(logs_dir):]
    log_path = log_path.lstrip(sep)
    return _LOG_PATH_HINT.format(log_path=log_path)
137
+
138
+
139
def starting_message(message: str) -> str:
    """Builds the line announcing that a step is starting.

    Args:
        message: The text to show after the gear symbol.

    Returns:
        The message prefixed with a style reset and a gear symbol.
    """
    # The style is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave its color active and mess up
    # this line.
    reset = colorama.Style.RESET_ALL
    return f'{reset}⚙︎ {message}'
145
+
146
+
147
def finishing_message(message: str,
                      log_path: Optional[Union[str, 'pathlib.Path']] = None,
                      is_local: bool = False,
                      follow_up_message: Optional[str] = None) -> str:
    """Builds the line reporting that a step finished successfully.

    Args:
        message: The main message, shown in green after a check mark.
        log_path: If given, a log path hint is appended after the message.
        is_local: Whether the log path is local or on remote API server.
        follow_up_message: Optional text placed right after the main message.
            It comes after the color reset, so it is not colored.

    Returns:
        The formatted success line.
    """
    trailer = '' if follow_up_message is None else follow_up_message
    # The style is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave its color active and mess up
    # this line.
    headline = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
                f'{message}{colorama.Style.RESET_ALL}{trailer}')
    if log_path is not None:
        return f'{headline} {log_path_hint(log_path, is_local)}'
    return headline
171
+
172
+
173
def error_message(message: str,
                  log_path: Optional[Union[str, 'pathlib.Path']] = None,
                  is_local: bool = False) -> str:
    """Builds the line reporting that a step failed.

    Args:
        message: The error text, shown uncolored after a red cross.
        log_path: If given, a log path hint is appended after the message.
        is_local: Whether the log path is local or on remote API server.

    Returns:
        The formatted error line.
    """
    # The style is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave its color active and mess up
    # this line.
    reset = colorama.Style.RESET_ALL
    line = f'{reset}{colorama.Fore.RED}⨯{reset} {message}'
    if log_path is not None:
        line = f'{line} {log_path_hint(log_path, is_local)}'
    return line
186
+
187
+
188
def retry_message(message: str) -> str:
    """Builds the line reporting that a step is being retried.

    Args:
        message: The text to show, uncolored, after a yellow retry arrow.

    Returns:
        The formatted retry line.
    """
    # The style is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave its color active and mess up
    # this line.
    reset = colorama.Style.RESET_ALL
    arrow = f'{colorama.Fore.YELLOW}↺{reset}'
    return f'{reset}{arrow} {message}'
195
+
196
+
197
def spinner_message(message: str,
                    log_path: Optional[Union[str, 'pathlib.Path']] = None,
                    is_local: bool = False) -> str:
    """Builds the text shown next to a spinner.

    Args:
        message: The spinner text; it is wrapped in `[bold cyan]...[/]`
            markup.
        log_path: If given, a log path hint is appended after the message.
        is_local: Whether the log path is local or on remote API server.

    Returns:
        The marked-up message, followed by the log path hint when a log
        path is provided.
    """
    parts = [f'[bold cyan]{message}[/]']
    if log_path is not None:
        parts.append(log_path_hint(log_path, is_local))
    return ' '.join(parts)
206
+
207
+
208
class CommandHintType(enum.Enum):
    """Selects which set of follow-up command hints to display."""
    # Hints for a job on a cluster (cancel/logs/queue plus cluster commands).
    CLUSTER_JOB = 'cluster_job'
    # Hints for a managed job (`sky jobs ...` commands).
    MANAGED_JOB = 'managed_job'
211
+
212
+
213
def command_hint_messages(hint_type: CommandHintType,
                          job_id: Optional[str] = None,
                          cluster_name: Optional[str] = None) -> str:
    """Builds the "Useful Commands" hint text for a job or managed job.

    Args:
        hint_type: Which set of hints to produce.
        job_id: The job ID interpolated into the commands. For
            `CLUSTER_JOB`, the job-specific section is omitted when this is
            None.
        cluster_name: The cluster name interpolated into the `CLUSTER_JOB`
            commands; not used for `MANAGED_JOB`.

    Returns:
        A multi-line string that starts with a newline and the
        "Useful Commands" title, followed by one line per hint.

    Raises:
        ValueError: If `hint_type` is not a supported `CommandHintType`.
    """
    # Every row is joined with '\n'; the title carries its own leading
    # newline so the block is separated from preceding output.
    title = '\n📋 Useful Commands'

    if hint_type == CommandHintType.CLUSTER_JOB:
        rows = [title]
        if job_id is not None:
            rows += [
                f'Job ID: {job_id}',
                (f'{INDENT_SYMBOL}To cancel the job:\t\t'
                 f'{BOLD}sky cancel {cluster_name} {job_id}{RESET_BOLD}'),
                (f'{INDENT_SYMBOL}To stream job logs:\t\t'
                 f'{BOLD}sky logs {cluster_name} {job_id}{RESET_BOLD}'),
                (f'{INDENT_LAST_SYMBOL}To view job queue:\t\t'
                 f'{BOLD}sky queue {cluster_name}{RESET_BOLD}'),
            ]
        rows += [
            f'Cluster name: {cluster_name}',
            (f'{INDENT_SYMBOL}To log into the head VM:\t'
             f'{BOLD}ssh {cluster_name}{RESET_BOLD}'),
            (f'{INDENT_SYMBOL}To submit a job:\t\t'
             f'{BOLD}sky exec {cluster_name} yaml_file{RESET_BOLD}'),
            (f'{INDENT_SYMBOL}To stop the cluster:\t'
             f'{BOLD}sky stop {cluster_name}{RESET_BOLD}'),
            (f'{INDENT_LAST_SYMBOL}To teardown the cluster:\t'
             f'{BOLD}sky down {cluster_name}{RESET_BOLD}'),
        ]
        return '\n'.join(rows)

    if hint_type == CommandHintType.MANAGED_JOB:
        rows = [
            title,
            f'Managed Job ID: {job_id}',
            (f'{INDENT_SYMBOL}To cancel the job:\t\t'
             f'{BOLD}sky jobs cancel {job_id}{RESET_BOLD}'),
            (f'{INDENT_SYMBOL}To stream job logs:\t\t'
             f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'),
            (f'{INDENT_SYMBOL}To stream controller logs:\t\t'
             f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'),
            (f'{INDENT_SYMBOL}To view all managed jobs:\t\t'
             f'{BOLD}sky jobs queue{RESET_BOLD}'),
            (f'{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
             f'{BOLD}sky jobs dashboard{RESET_BOLD}'),
        ]
        return '\n'.join(rows)

    raise ValueError(f'Invalid hint type: {hint_type}')