skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  This is a remote utility module that provides logging functionality.
4
4
  """
5
+ import collections
5
6
  import copy
6
7
  import io
7
8
  import multiprocessing.pool
@@ -12,7 +13,8 @@ import sys
12
13
  import tempfile
13
14
  import textwrap
14
15
  import time
15
- from typing import Dict, Iterator, List, Optional, Tuple, Union
16
+ from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
17
+ Tuple, Union)
16
18
 
17
19
  import colorama
18
20
 
@@ -21,13 +23,19 @@ from sky.skylet import constants
21
23
  from sky.skylet import job_lib
22
24
  from sky.utils import log_utils
23
25
  from sky.utils import subprocess_utils
26
+ from sky.utils import ux_utils
24
27
 
25
- _SKY_LOG_WAITING_GAP_SECONDS = 1
26
- _SKY_LOG_WAITING_MAX_RETRY = 5
27
- _SKY_LOG_TAILING_GAP_SECONDS = 0.2
28
+ SKY_LOG_WAITING_GAP_SECONDS = 1
29
+ SKY_LOG_WAITING_MAX_RETRY = 5
30
+ SKY_LOG_TAILING_GAP_SECONDS = 0.2
31
+ # Peek the head of the lines to check if we need to start
32
+ # streaming when tail > 0.
33
+ PEEK_HEAD_LINES_FOR_START_STREAM = 20
28
34
 
29
35
  logger = sky_logging.init_logger(__name__)
30
36
 
37
+ LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
38
+
31
39
 
32
40
  class _ProcessingArgs:
33
41
  """Arguments for processing logs."""
@@ -170,53 +178,19 @@ def run_with_log(
170
178
  if process_stream:
171
179
  stdout_arg = subprocess.PIPE
172
180
  stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
181
+ # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
182
+ # the terminal output when typing in the terminal that starts the API
183
+ # server.
184
+ stdin = kwargs.pop('stdin', subprocess.DEVNULL)
173
185
  with subprocess.Popen(cmd,
174
186
  stdout=stdout_arg,
175
187
  stderr=stderr_arg,
176
188
  start_new_session=True,
177
189
  shell=shell,
190
+ stdin=stdin,
178
191
  **kwargs) as proc:
179
192
  try:
180
- # The proc can be defunct if the python program is killed. Here we
181
- # open a new subprocess to gracefully kill the proc, SIGTERM
182
- # and then SIGKILL the process group.
183
- # Adapted from ray/dashboard/modules/job/job_manager.py#L154
184
- parent_pid = os.getpid()
185
- daemon_script = os.path.join(
186
- os.path.dirname(os.path.abspath(job_lib.__file__)),
187
- 'subprocess_daemon.py')
188
- if not hasattr(constants, 'SKY_GET_PYTHON_PATH_CMD'):
189
- # Backward compatibility: for cluster started before #3326, this
190
- # constant does not exist. Since we generate the job script
191
- # in backends.cloud_vm_ray_backend with inspect, so the
192
- # the lates `run_with_log` will be used, but the `constants` is
193
- # not updated. We fallback to `python3` in this case.
194
- # TODO(zhwu): remove this after 0.7.0.
195
- python_path = 'python3'
196
- else:
197
- python_path = subprocess.check_output(
198
- constants.SKY_GET_PYTHON_PATH_CMD,
199
- shell=True,
200
- stderr=subprocess.DEVNULL,
201
- encoding='utf-8').strip()
202
- daemon_cmd = [
203
- python_path,
204
- daemon_script,
205
- '--parent-pid',
206
- str(parent_pid),
207
- '--proc-pid',
208
- str(proc.pid),
209
- ]
210
-
211
- subprocess.Popen(
212
- daemon_cmd,
213
- start_new_session=True,
214
- # Suppress output
215
- stdout=subprocess.DEVNULL,
216
- stderr=subprocess.DEVNULL,
217
- # Disable input
218
- stdin=subprocess.DEVNULL,
219
- )
193
+ subprocess_utils.kill_process_daemon(proc.pid)
220
194
  stdout = ''
221
195
  stderr = ''
222
196
 
@@ -263,6 +237,9 @@ def make_task_bash_script(codegen: str,
263
237
  # set -a is used for exporting all variables functions to the environment
264
238
  # so that bash `user_script` can access `conda activate`. Detail: #436.
265
239
  # Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html # pylint: disable=line-too-long
240
+ # DEACTIVATE_SKY_REMOTE_PYTHON_ENV: Deactivate the SkyPilot runtime env, as
241
+ # the ray cluster is started within the runtime env, which may cause the
242
+ # user program to run in that env as well.
266
243
  # PYTHONUNBUFFERED is used to disable python output buffering.
267
244
  script = [
268
245
  textwrap.dedent(f"""\
@@ -271,6 +248,7 @@ def make_task_bash_script(codegen: str,
271
248
  set -a
272
249
  . $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true
273
250
  set +a
251
+ {constants.DEACTIVATE_SKY_REMOTE_PYTHON_ENV}
274
252
  export PYTHONUNBUFFERED=1
275
253
  cd {constants.SKY_REMOTE_WORKDIR}"""),
276
254
  ]
@@ -316,21 +294,16 @@ def run_bash_command_with_log(bash_command: str,
316
294
  # Need this `-i` option to make sure `source ~/.bashrc` work.
317
295
  inner_command = f'/bin/bash -i {script_path}'
318
296
 
319
- subprocess_cmd: Union[str, List[str]]
320
- subprocess_cmd = inner_command
321
-
322
- return run_with_log(
323
- subprocess_cmd,
324
- log_path,
325
- stream_logs=stream_logs,
326
- with_ray=with_ray,
327
- # Disable input to avoid blocking.
328
- stdin=subprocess.DEVNULL,
329
- shell=True)
297
+ return run_with_log(inner_command,
298
+ log_path,
299
+ stream_logs=stream_logs,
300
+ with_ray=with_ray,
301
+ shell=True)
330
302
 
331
303
 
332
304
  def _follow_job_logs(file,
333
305
  job_id: int,
306
+ start_streaming: bool,
334
307
  start_streaming_at: str = '') -> Iterator[str]:
335
308
  """Yield each line from a file as they are written.
336
309
 
@@ -339,7 +312,6 @@ def _follow_job_logs(file,
339
312
  # No need to lock the status here, as the while loop can handle
340
313
  # the older status.
341
314
  status = job_lib.get_status_no_lock(job_id)
342
- start_streaming = False
343
315
  wait_last_logs = True
344
316
  while True:
345
317
  tmp = file.readline()
@@ -366,21 +338,58 @@ def _follow_job_logs(file,
366
338
  ]:
367
339
  if wait_last_logs:
368
340
  # Wait all the logs are printed before exit.
369
- time.sleep(1 + _SKY_LOG_TAILING_GAP_SECONDS)
341
+ time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
370
342
  wait_last_logs = False
371
343
  continue
372
344
  status_str = status.value if status is not None else 'None'
373
- print(f'INFO: Job finished (status: {status_str}).')
345
+ print(ux_utils.finishing_message(
346
+ f'Job finished (status: {status_str}).'),
347
+ flush=True)
374
348
  return
375
349
 
376
- time.sleep(_SKY_LOG_TAILING_GAP_SECONDS)
350
+ time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
377
351
  status = job_lib.get_status_no_lock(job_id)
378
352
 
379
353
 
354
+ def _peek_head_lines(log_file: TextIO) -> List[str]:
355
+ """Peek the head of the file."""
356
+ lines = [
357
+ log_file.readline() for _ in range(PEEK_HEAD_LINES_FOR_START_STREAM)
358
+ ]
359
+ # Reset the file pointer to the beginning
360
+ log_file.seek(0, os.SEEK_SET)
361
+ return [line for line in lines if line]
362
+
363
+
364
+ def _should_stream_the_whole_tail_lines(head_lines_of_log_file: List[str],
365
+ tail_lines: Deque[str],
366
+ start_stream_at: str) -> bool:
367
+ """Check if the entire tail lines should be streamed."""
368
+ # See comment:
369
+ # https://github.com/skypilot-org/skypilot/pull/4241#discussion_r1833611567
370
+ # for more details.
371
+ # Case 1: If start_stream_at is found at the head of the tail lines,
372
+ # we should not stream the whole tail lines.
373
+ for index, line in enumerate(tail_lines):
374
+ if index >= PEEK_HEAD_LINES_FOR_START_STREAM:
375
+ break
376
+ if start_stream_at in line:
377
+ return False
378
+ # Case 2: If start_stream_at is found at the head of log file, but not at
379
+ # the tail lines, we need to stream the whole tail lines.
380
+ for line in head_lines_of_log_file:
381
+ if start_stream_at in line:
382
+ return True
383
+ # Case 3: If start_stream_at is not at the head, and not found at the tail
384
+ # lines, we should not stream the whole tail lines.
385
+ return False
386
+
387
+
380
388
  def tail_logs(job_id: Optional[int],
381
389
  log_dir: Optional[str],
382
390
  managed_job_id: Optional[int] = None,
383
- follow: bool = True) -> None:
391
+ follow: bool = True,
392
+ tail: int = 0) -> None:
384
393
  """Tail the logs of a job.
385
394
 
386
395
  Args:
@@ -389,6 +398,8 @@ def tail_logs(job_id: Optional[int],
389
398
  managed_job_id: The managed job id (for logging info only to avoid
390
399
  confusion).
391
400
  follow: Whether to follow the logs or print the logs so far and exit.
401
+ tail: The number of lines to display from the end of the log file,
402
+ if 0, print all lines.
392
403
  """
393
404
  if job_id is None:
394
405
  # This only happens when job_lib.get_latest_job_id() returns None,
@@ -405,8 +416,6 @@ def tail_logs(job_id: Optional[int],
405
416
  return
406
417
  logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
407
418
  f'{managed_job_id}.')
408
- logger.info(f'{colorama.Fore.YELLOW}Start streaming logs for {job_str}.'
409
- f'{colorama.Style.RESET_ALL}')
410
419
  log_path = os.path.join(log_dir, 'run.log')
411
420
  log_path = os.path.expanduser(log_path)
412
421
 
@@ -419,18 +428,20 @@ def tail_logs(job_id: Optional[int],
419
428
  retry_cnt += 1
420
429
  if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
421
430
  break
422
- if retry_cnt >= _SKY_LOG_WAITING_MAX_RETRY:
431
+ if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
423
432
  print(
424
433
  f'{colorama.Fore.RED}ERROR: Logs for '
425
434
  f'{job_str} (status: {status.value}) does not exist '
426
435
  f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
427
436
  return
428
- print(f'INFO: Waiting {_SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
437
+ print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
429
438
  'to be written...')
430
- time.sleep(_SKY_LOG_WAITING_GAP_SECONDS)
439
+ time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
431
440
  status = job_lib.update_job_status([job_id], silent=True)[0]
432
441
 
433
- start_stream_at = 'INFO: Tip: use Ctrl-C to exit log'
442
+ start_stream_at = LOG_FILE_START_STREAMING_AT
443
+ # Explicitly declare the type to avoid mypy warning.
444
+ lines: Iterable[str] = []
434
445
  if follow and status in [
435
446
  job_lib.JobStatus.SETTING_UP,
436
447
  job_lib.JobStatus.PENDING,
@@ -441,19 +452,48 @@ def tail_logs(job_id: Optional[int],
441
452
  with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
442
453
  # Using `_follow` instead of `tail -f` to streaming the whole
443
454
  # log and creating a new process for tail.
455
+ start_streaming = False
456
+ if tail > 0:
457
+ head_lines_of_log_file = _peek_head_lines(log_file)
458
+ lines = collections.deque(log_file, maxlen=tail)
459
+ start_streaming = _should_stream_the_whole_tail_lines(
460
+ head_lines_of_log_file, lines, start_stream_at)
461
+ for line in lines:
462
+ if start_stream_at in line:
463
+ start_streaming = True
464
+ if start_streaming:
465
+ print(line, end='')
466
+ # Flush the last n lines
467
+ print(end='', flush=True)
468
+ # Now, the cursor is at the end of the last lines
469
+ # if tail > 0
444
470
  for line in _follow_job_logs(log_file,
445
471
  job_id=job_id,
472
+ start_streaming=start_streaming,
446
473
  start_streaming_at=start_stream_at):
447
474
  print(line, end='', flush=True)
448
475
  else:
449
476
  try:
450
- start_stream = False
451
- with open(log_path, 'r', encoding='utf-8') as f:
452
- for line in f.readlines():
477
+ start_streaming = False
478
+ with open(log_path, 'r', encoding='utf-8') as log_file:
479
+ if tail > 0:
480
+ # If tail > 0, we need to read the last n lines.
481
+ # We use double ended queue to rotate the last n lines.
482
+ head_lines_of_log_file = _peek_head_lines(log_file)
483
+ lines = collections.deque(log_file, maxlen=tail)
484
+ start_streaming = _should_stream_the_whole_tail_lines(
485
+ head_lines_of_log_file, lines, start_stream_at)
486
+ else:
487
+ lines = log_file
488
+ for line in lines:
453
489
  if start_stream_at in line:
454
- start_stream = True
455
- if start_stream:
490
+ start_streaming = True
491
+ if start_streaming:
456
492
  print(line, end='', flush=True)
493
+ status_str = status.value if status is not None else 'None'
494
+ print(ux_utils.finishing_message(
495
+ f'Job finished (status: {status_str}).'),
496
+ flush=True)
457
497
  except FileNotFoundError:
458
498
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
459
499
  f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
sky/skylet/log_lib.pyi CHANGED
@@ -13,6 +13,12 @@ from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
14
  from sky.utils import log_utils as log_utils
15
15
 
16
+ SKY_LOG_WAITING_GAP_SECONDS: int = ...
17
+ SKY_LOG_WAITING_MAX_RETRY: int = ...
18
+ SKY_LOG_TAILING_GAP_SECONDS: float = ...
19
+ LOG_FILE_START_STREAMING_AT: str = ...
20
+
21
+
16
22
  class _ProcessingArgs:
17
23
  log_path: str
18
24
  stream_logs: bool
@@ -25,7 +25,7 @@ def docker_start_cmds(
25
25
  docker_cmd,
26
26
  ):
27
27
  """Generating docker start command without --rm.
28
-
28
+
29
29
  The code is borrowed from `ray.autoscaler._private.docker`.
30
30
 
31
31
  Changes we made:
@@ -65,8 +65,8 @@ def docker_start_cmds(
65
65
  '--cap-add=SYS_ADMIN',
66
66
  '--device=/dev/fuse',
67
67
  '--security-opt=apparmor:unconfined',
68
+ '--entrypoint=/bin/bash',
68
69
  image,
69
- 'bash',
70
70
  ]
71
71
  return ' '.join(docker_run)
72
72
 
@@ -159,19 +159,17 @@ class SkyDockerCommandRunner(DockerCommandRunner):
159
159
  return True
160
160
 
161
161
  # SkyPilot: Docker login if user specified a private docker registry.
162
- if "docker_login_config" in self.docker_config:
162
+ if 'docker_login_config' in self.docker_config:
163
163
  # TODO(tian): Maybe support a command to get the login password?
164
- docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[
165
- "docker_login_config"]
164
+ docker_login_config: docker_utils.DockerLoginConfig = (
165
+ self.docker_config['docker_login_config'])
166
166
  self._run_with_retry(
167
167
  f'{self.docker_cmd} login --username '
168
168
  f'{docker_login_config.username} --password '
169
169
  f'{docker_login_config.password} {docker_login_config.server}')
170
170
  # We automatically add the server prefix to the image name if
171
171
  # the user did not add it.
172
- server_prefix = f'{docker_login_config.server}/'
173
- if not specific_image.startswith(server_prefix):
174
- specific_image = f'{server_prefix}{specific_image}'
172
+ specific_image = docker_login_config.format_image(specific_image)
175
173
 
176
174
  if self.docker_config.get('pull_before_run', True):
177
175
  assert specific_image, ('Image must be included in config if '
@@ -377,7 +377,7 @@ class IBMVPCNodeProvider(NodeProvider):
377
377
  node["id"], nic_id
378
378
  ).get_result()
379
379
  floating_ips = res["floating_ips"]
380
- if len(floating_ips) == 0:
380
+ if not floating_ips:
381
381
  # not adding a node that's yet/failed to
382
382
  # to get a floating ip provisioned
383
383
  continue
@@ -485,7 +485,7 @@ class IBMVPCNodeProvider(NodeProvider):
485
485
  """Returns instance (node) information matching the specified name"""
486
486
 
487
487
  instances_data = self.ibm_vpc_client.list_instances(name=name).get_result()
488
- if len(instances_data["instances"]) > 0:
488
+ if instances_data["instances"]:
489
489
  return instances_data["instances"][0]
490
490
  return None
491
491
 
@@ -107,20 +107,28 @@ class ZoneConfig:
107
107
  for item in subnet_contents
108
108
  if item['subnetState'] == 'ACTIVE' and item["vpcId"] == vpc
109
109
  ]
110
- if len(subnet_list) > 0:
110
+ if subnet_list:
111
111
  vpc_subnets[vpc] = subnet_list
112
112
 
113
113
  return vpc_subnets
114
114
 
115
115
  def _get_vm_init_script(self, ssh_public_key):
116
116
 
117
+ import subprocess
117
118
  init_script_content = self._get_default_config_cmd(
118
119
  ) + self._get_ssh_key_gen_cmd(ssh_public_key)
120
+ init_script_content_string = f'"{init_script_content}"'
121
+ command = f'echo {init_script_content_string} | base64'
122
+ result = subprocess.run(command,
123
+ shell=True,
124
+ capture_output=True,
125
+ text=True)
126
+ init_script_content_base64 = result.stdout
119
127
  return {
120
- "encodingType": "plain",
128
+ "encodingType": "base64",
121
129
  "initialScriptShell": "bash",
122
130
  "initialScriptType": "text",
123
- "initialScriptContent": init_script_content
131
+ "initialScriptContent": init_script_content_base64
124
132
  }
125
133
 
126
134
  def _get_ssh_key_gen_cmd(self, ssh_public_key):
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
180
180
  metadata['tags'] = instance_info['tags']
181
181
  # TODO(ewzeng): The internal ip is hard to get, so set it to the
182
182
  # external ip as a hack. This should be changed in the future.
183
- # https://docs.lambdalabs.com/cloud/learn-private-ip-address/
183
+ # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
184
184
  metadata['internal_ip'] = vm['ip']
185
185
  metadata['external_ip'] = vm['external_ip']
186
186
  return metadata
@@ -259,7 +259,7 @@ class SCPNodeProvider(NodeProvider):
259
259
  for sg in sg_contents
260
260
  if sg["securityGroupId"] == sg_id
261
261
  ]
262
- if len(sg) != 0 and sg[0] == "ACTIVE":
262
+ if sg and sg[0] == "ACTIVE":
263
263
  break
264
264
  time.sleep(5)
265
265
 
@@ -282,16 +282,16 @@ class SCPNodeProvider(NodeProvider):
282
282
  for sg in sg_contents
283
283
  if sg["securityGroupId"] == sg_id
284
284
  ]
285
- if len(sg) == 0:
285
+ if not sg:
286
286
  break
287
287
 
288
288
  def _refresh_security_group(self, vms):
289
- if len(vms) > 0:
289
+ if vms:
290
290
  return
291
291
  # remove security group if vm does not exist
292
292
  keys = self.metadata.keys()
293
293
  security_group_id = self.metadata[
294
- keys[0]]['creation']['securityGroupId'] if len(keys) > 0 else None
294
+ keys[0]]['creation']['securityGroupId'] if keys else None
295
295
  if security_group_id:
296
296
  try:
297
297
  self._del_security_group(security_group_id)
@@ -308,7 +308,7 @@ class SCPNodeProvider(NodeProvider):
308
308
  for vm in vm_contents
309
309
  if vm["virtualServerId"] == vm_id
310
310
  ]
311
- if len(vms) == 0:
311
+ if not vms:
312
312
  break
313
313
 
314
314
  def _del_firwall_rules(self, firewall_id, rule_ids):
@@ -391,7 +391,7 @@ class SCPNodeProvider(NodeProvider):
391
391
  return None, None, None, None
392
392
 
393
393
  def _undo_funcs(self, undo_func_list):
394
- while len(undo_func_list) > 0:
394
+ while undo_func_list:
395
395
  func = undo_func_list.pop()
396
396
  func()
397
397
 
@@ -468,7 +468,7 @@ class SCPNodeProvider(NodeProvider):
468
468
 
469
469
  zone_config = ZoneConfig(self.scp_client, node_config)
470
470
  vpc_subnets = zone_config.get_vcp_subnets()
471
- if (len(vpc_subnets) == 0):
471
+ if not vpc_subnets:
472
472
  raise SCPError("This region/zone does not have available VPCs.")
473
473
 
474
474
  instance_config = zone_config.bootstrap_instance_config(node_config)
sky/skylet/skylet.py CHANGED
@@ -20,11 +20,13 @@ EVENTS = [
20
20
  # The managed job update event should be after the job update event.
21
21
  # Otherwise, the abnormal managed job status update will be delayed
22
22
  # until the next job update event.
23
- events.ManagedJobUpdateEvent(),
23
+ events.ManagedJobEvent(),
24
24
  # This is for monitoring controller job status. If it becomes
25
25
  # unhealthy, this event will correctly update the controller
26
26
  # status to CONTROLLER_FAILED.
27
27
  events.ServiceUpdateEvent(),
28
+ # Report usage heartbeat every 10 minutes.
29
+ events.UsageHeartbeatReportEvent(),
28
30
  ]
29
31
 
30
32
  while True:
@@ -1,20 +1,57 @@
1
1
  """Sky subprocess daemon.
2
-
3
2
  Wait for parent_pid to exit, then SIGTERM (or SIGKILL if needed) the child
4
3
  processes of proc_pid.
5
4
  """
6
-
7
5
  import argparse
6
+ import os
8
7
  import sys
9
8
  import time
10
9
 
11
10
  import psutil
12
11
 
13
- if __name__ == '__main__':
14
12
 
13
+ def daemonize():
14
+ """Detaches the process from its parent process with double-forking.
15
+
16
+ This detachment is crucial in the context of SkyPilot and Ray job. When
17
+ 'sky cancel' is executed, it uses Ray's stop job API to terminate the job.
18
+ Without daemonization, this subprocess_daemon process will still be a child
19
+ of the parent process which would be terminated along with the parent
20
+ process, ray::task or the cancel request for jobs, which is launched with
21
+ Ray job. Daemonization ensures this process survives the 'sky cancel'
22
+ command, allowing it to prevent orphaned processes of Ray job.
23
+ """
24
+ # First fork: Creates a child process identical to the parent
25
+ if os.fork() > 0:
26
+ # Parent process exits, allowing the child to run independently
27
+ sys.exit()
28
+
29
+ # Continues to run from first forked child process.
30
+ # Detach from parent environment.
31
+ os.setsid()
32
+
33
+ # Second fork: Creates a grandchild process
34
+ if os.fork() > 0:
35
+ # First child exits, orphaning the grandchild
36
+ sys.exit()
37
+ # Continues execution in the grandchild process
38
+ # This process is now fully detached from the original parent and terminal
39
+
40
+
41
+ if __name__ == '__main__':
42
+ daemonize()
15
43
  parser = argparse.ArgumentParser()
16
44
  parser.add_argument('--parent-pid', type=int, required=True)
17
45
  parser.add_argument('--proc-pid', type=int, required=True)
46
+ parser.add_argument(
47
+ '--initial-children',
48
+ type=str,
49
+ default='',
50
+ help=(
51
+ 'Comma-separated list of initial children PIDs. This is to guard '
52
+ 'against the case where the target process has already terminated, '
53
+ 'while the children are still running.'),
54
+ )
18
55
  args = parser.parse_args()
19
56
 
20
57
  process = None
@@ -25,32 +62,47 @@ if __name__ == '__main__':
25
62
  except psutil.NoSuchProcess:
26
63
  pass
27
64
 
28
- if process is None:
29
- sys.exit()
65
+ # Initialize children list from arguments
66
+ children = []
67
+ if args.initial_children:
68
+ for pid in args.initial_children.split(','):
69
+ try:
70
+ child = psutil.Process(int(pid))
71
+ children.append(child)
72
+ except (psutil.NoSuchProcess, ValueError):
73
+ pass
30
74
 
31
- if parent_process is not None:
32
- # Wait for either parent or target process to exit.
75
+ if process is not None and parent_process is not None:
76
+ # Wait for either parent or target process to exit
33
77
  while process.is_running() and parent_process.is_running():
78
+ try:
79
+ tmp_children = process.children(recursive=True)
80
+ if tmp_children:
81
+ children = tmp_children
82
+ except psutil.NoSuchProcess:
83
+ pass
34
84
  time.sleep(1)
35
85
 
36
- try:
37
- children = process.children(recursive=True)
38
- children.append(process)
39
- except psutil.NoSuchProcess:
86
+ if process is not None:
87
+ # Kill the target process first to avoid having more children, or fail
88
+ # the process due to the children being defunct.
89
+ children = [process] + children
90
+
91
+ if not children:
40
92
  sys.exit()
41
93
 
42
- for pid in children:
94
+ for child in children:
43
95
  try:
44
- pid.terminate()
96
+ child.terminate()
45
97
  except psutil.NoSuchProcess:
46
- pass
98
+ continue
47
99
 
48
100
  # Wait 30s for the processes to exit gracefully.
49
101
  time.sleep(30)
50
102
 
51
103
  # SIGKILL if they're still running.
52
- for pid in children:
104
+ for child in children:
53
105
  try:
54
- pid.kill()
106
+ child.kill()
55
107
  except psutil.NoSuchProcess:
56
- pass
108
+ continue