skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,24 @@
1
+ # When using pod@namespace+context, rsync passes args as: {us} -l pod namespace+context
2
+ # We need to split the pod@namespace+context into pod, namespace and context
3
+ # For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
4
+ shift
5
+ pod=$1
6
+ shift
7
+ echo "pod: $pod" >&2
8
+ encoded_namespace_context=$1
9
+ # Revert the encoded namespace+context to the original string.
10
+ namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
11
+ echo "namespace_context: $namespace_context" >&2
12
+ namespace=$(echo $namespace_context | cut -d+ -f1)
13
+ echo "namespace: $namespace" >&2
14
+ context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
15
+ echo "context: $context" >&2
16
+ context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
17
+ shift
18
+ if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
19
+ # If context is none, it means we are using incluster auth. In this case,
20
+ # use need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
21
+ kubectl exec -i $pod -n $namespace --kubeconfig=/dev/null -- "$@"
22
+ else
23
+ kubectl exec -i $pod -n $namespace --context=$context -- "$@"
24
+ fi
@@ -126,7 +126,7 @@ def manage_lifecycle():
126
126
  f'error: {e}\n')
127
127
  raise
128
128
 
129
- if len(ret.items) == 0:
129
+ if not ret.items:
130
130
  sys.stdout.write(
131
131
  f'[Lifecycle] Did not find pods with label '
132
132
  f'"{label_selector}" in namespace {current_namespace}\n')
sky/utils/log_utils.py CHANGED
@@ -1,13 +1,18 @@
1
1
  """Logging utils."""
2
2
  import enum
3
- from typing import List, Optional
3
+ import time
4
+ import types
5
+ from typing import Callable, Iterator, List, Optional, TextIO, Type
4
6
 
5
7
  import colorama
8
+ # slow due to https://github.com/python-pendulum/pendulum/issues/808
9
+ # FIXME(aylei): bump pendulum if it get fixed
6
10
  import pendulum
7
11
  import prettytable
8
12
 
9
13
  from sky import sky_logging
10
14
  from sky.utils import rich_utils
15
+ from sky.utils import ux_utils
11
16
 
12
17
  logger = sky_logging.init_logger(__name__)
13
18
 
@@ -15,13 +20,15 @@ logger = sky_logging.init_logger(__name__)
15
20
  class LineProcessor(object):
16
21
  """A processor for log lines."""
17
22
 
18
- def __enter__(self):
23
+ def __enter__(self) -> None:
19
24
  pass
20
25
 
21
- def process_line(self, log_line):
26
+ def process_line(self, log_line: str) -> None:
22
27
  pass
23
28
 
24
- def __exit__(self, except_type, except_value, traceback):
29
+ def __exit__(self, except_type: Optional[Type[BaseException]],
30
+ except_value: Optional[BaseException],
31
+ traceback: Optional[types.TracebackType]) -> None:
25
32
  del except_type, except_value, traceback # unused
26
33
  pass
27
34
 
@@ -34,33 +41,39 @@ class RayUpLineProcessor(LineProcessor):
34
41
  RUNTIME_SETUP = 1
35
42
  PULLING_DOCKER_IMAGES = 2
36
43
 
37
- def __enter__(self):
44
+ def __init__(self, log_path: str):
45
+ self.log_path = log_path
46
+
47
+ def __enter__(self) -> None:
38
48
  self.state = self.ProvisionStatus.LAUNCH
39
- self.status_display = rich_utils.safe_status('[bold cyan]Launching')
49
+ self.status_display = rich_utils.safe_status(
50
+ ux_utils.spinner_message('Launching', self.log_path))
40
51
  self.status_display.start()
41
52
 
42
- def process_line(self, log_line):
53
+ def process_line(self, log_line: str) -> None:
43
54
  if ('Success.' in log_line and
44
55
  self.state == self.ProvisionStatus.LAUNCH):
45
- logger.info(f'{colorama.Fore.GREEN}Head node is up.'
46
- f'{colorama.Style.RESET_ALL}')
56
+ logger.info(' Head VM is up.')
47
57
  self.status_display.update(
48
- '[bold cyan]Launching - Preparing SkyPilot runtime')
58
+ ux_utils.spinner_message(
59
+ 'Launching - Preparing SkyPilot runtime', self.log_path))
49
60
  self.state = self.ProvisionStatus.RUNTIME_SETUP
50
61
  if ('Pulling from' in log_line and
51
62
  self.state == self.ProvisionStatus.RUNTIME_SETUP):
52
63
  self.status_display.update(
53
- '[bold cyan]Launching - Pulling docker images')
64
+ ux_utils.spinner_message(
65
+ 'Launching - Initializing docker container', self.log_path))
54
66
  self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
55
67
  if ('Status: Downloaded newer image' in log_line and
56
68
  self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
57
- logger.info(f'{colorama.Fore.GREEN}Docker image is downloaded.'
58
- f'{colorama.Style.RESET_ALL}')
59
69
  self.status_display.update(
60
- '[bold cyan]Launching - Preparing SkyPilot runtime')
70
+ ux_utils.spinner_message(
71
+ 'Launching - Preparing SkyPilot runtime', self.log_path))
61
72
  self.state = self.ProvisionStatus.RUNTIME_SETUP
62
73
 
63
- def __exit__(self, except_type, except_value, traceback):
74
+ def __exit__(self, except_type: Optional[Type[BaseException]],
75
+ except_value: Optional[BaseException],
76
+ traceback: Optional[types.TracebackType]) -> None:
64
77
  del except_type, except_value, traceback # unused
65
78
  self.status_display.stop()
66
79
 
@@ -68,42 +81,69 @@ class RayUpLineProcessor(LineProcessor):
68
81
  class SkyLocalUpLineProcessor(LineProcessor):
69
82
  """A processor for `sky local up` log lines."""
70
83
 
84
+ def __init__(self, log_path: str, is_local: bool):
85
+ self.log_path = log_path
86
+ self.is_local = is_local
87
+
71
88
  def __enter__(self):
72
- status = rich_utils.safe_status('[bold cyan]Creating local cluster - '
73
- 'initializing Kubernetes')
89
+ # TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
90
+ # messages.
91
+ msg = 'Creating local cluster - initializing Kubernetes'
92
+ status = rich_utils.safe_status(
93
+ ux_utils.spinner_message(msg,
94
+ log_path=self.log_path,
95
+ is_local=self.is_local))
74
96
  self.status_display = status
75
97
  self.status_display.start()
76
98
 
77
- def process_line(self, log_line):
99
+ def process_line(self, log_line: str) -> None:
78
100
  if 'Kind cluster created.' in log_line:
79
101
  logger.info(f'{colorama.Fore.GREEN}Kubernetes is running.'
80
102
  f'{colorama.Style.RESET_ALL}')
81
103
  if 'Installing NVIDIA GPU operator...' in log_line:
82
- self.status_display.update('[bold cyan]Creating local cluster - '
83
- 'Installing NVIDIA GPU operator')
104
+ self.status_display.update(
105
+ ux_utils.spinner_message(
106
+ 'Creating local cluster - '
107
+ 'Installing NVIDIA GPU operator',
108
+ log_path=self.log_path,
109
+ is_local=self.is_local))
84
110
  if 'Starting wait for GPU operator installation...' in log_line:
85
111
  self.status_display.update(
86
- '[bold cyan]Creating local cluster - '
87
- 'waiting for NVIDIA GPU operator installation to complete')
112
+ ux_utils.spinner_message(
113
+ 'Creating local cluster - '
114
+ 'waiting for NVIDIA GPU operator installation to complete',
115
+ log_path=self.log_path,
116
+ is_local=self.is_local))
88
117
  logger.info('To check NVIDIA GPU operator status, '
89
118
  'see pods: kubectl get pods -n gpu-operator')
90
119
  if 'GPU operator installed' in log_line:
91
120
  logger.info(f'{colorama.Fore.GREEN}NVIDIA GPU Operator installed.'
92
121
  f'{colorama.Style.RESET_ALL}')
93
122
  if 'Pulling SkyPilot GPU image...' in log_line:
94
- self.status_display.update('[bold cyan]Creating local cluster - '
95
- 'pulling and loading SkyPilot GPU image')
123
+ self.status_display.update(
124
+ ux_utils.spinner_message(
125
+ 'Creating local cluster - '
126
+ 'pulling and loading SkyPilot GPU image',
127
+ log_path=self.log_path,
128
+ is_local=self.is_local))
96
129
  if 'SkyPilot GPU image loaded into kind cluster' in log_line:
97
130
  logger.info(f'{colorama.Fore.GREEN}SkyPilot GPU image pulled.'
98
131
  f'{colorama.Style.RESET_ALL}')
99
132
  if 'Labelling nodes with GPUs...' in log_line:
100
- self.status_display.update('[bold cyan]Creating local cluster - '
101
- 'launching GPU labelling jobs')
133
+ self.status_display.update(
134
+ ux_utils.spinner_message(
135
+ 'Creating local cluster - '
136
+ 'launching GPU labelling jobs',
137
+ log_path=self.log_path,
138
+ is_local=self.is_local))
102
139
  if ('Starting wait for SkyPilot GPU labeling jobs to complete'
103
140
  in log_line):
104
141
  self.status_display.update(
105
- '[bold cyan]Creating local cluster - '
106
- 'waiting for GPU labelling jobs to complete')
142
+ ux_utils.spinner_message(
143
+ 'Creating local cluster - '
144
+ 'waiting for GPU labelling jobs to complete',
145
+ log_path=self.log_path,
146
+ is_local=self.is_local))
107
147
  logger.info(
108
148
  'To check GPU labelling status, see jobs: '
109
149
  'kubectl get jobs -n kube-system -l job=sky-gpu-labeler')
@@ -111,20 +151,136 @@ class SkyLocalUpLineProcessor(LineProcessor):
111
151
  logger.info(f'{colorama.Fore.GREEN}GPU labelling complete.'
112
152
  f'{colorama.Style.RESET_ALL}')
113
153
  if 'Pulling SkyPilot CPU image...' in log_line:
114
- self.status_display.update('[bold cyan]Creating local cluster - '
115
- 'pulling and loading SkyPilot CPU image')
154
+ self.status_display.update(
155
+ ux_utils.spinner_message(
156
+ 'Creating local cluster - '
157
+ 'pulling and loading SkyPilot CPU image',
158
+ log_path=self.log_path,
159
+ is_local=self.is_local))
116
160
  if 'SkyPilot CPU image loaded into kind cluster' in log_line:
117
161
  logger.info(f'{colorama.Fore.GREEN}SkyPilot CPU image pulled.'
118
162
  f'{colorama.Style.RESET_ALL}')
119
163
  if 'Starting installation of Nginx Ingress Controller...' in log_line:
120
164
  self.status_display.update(
121
- '[bold cyan]Creating Nginx Ingress Controller')
165
+ ux_utils.spinner_message(
166
+ 'Creating local cluster - '
167
+ 'creating Nginx Ingress Controller',
168
+ log_path=self.log_path,
169
+ is_local=self.is_local))
122
170
  if 'Nginx Ingress Controller installed' in log_line:
123
171
  logger.info(
124
172
  f'{colorama.Fore.GREEN}Nginx Ingress Controller installed.'
125
173
  f'{colorama.Style.RESET_ALL}')
174
+ self.status_display.update(
175
+ ux_utils.spinner_message('Wrapping up local cluster setup',
176
+ log_path=self.log_path,
177
+ is_local=self.is_local))
126
178
 
127
- def __exit__(self, except_type, except_value, traceback):
179
+ def __exit__(self, except_type: Optional[Type[BaseException]],
180
+ except_value: Optional[BaseException],
181
+ traceback: Optional[types.TracebackType]) -> None:
182
+ del except_type, except_value, traceback # unused
183
+ self.status_display.stop()
184
+
185
+
186
+ class SkyRemoteUpLineProcessor(LineProcessor):
187
+ """A processor for deploy_remote_cluster.sh log lines."""
188
+
189
+ def __init__(self, log_path: str, is_local: bool):
190
+ self.log_path = log_path
191
+ self.is_local = is_local
192
+
193
+ def __enter__(self) -> None:
194
+ # TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
195
+ # messages.
196
+ status = rich_utils.safe_status(
197
+ ux_utils.spinner_message('Creating remote cluster',
198
+ log_path=self.log_path,
199
+ is_local=self.is_local))
200
+ self.status_display = status
201
+ self.status_display.start()
202
+
203
+ def process_line(self, log_line: str) -> None:
204
+ # Pre-flight checks
205
+ if 'SSH connection successful' in log_line:
206
+ logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
207
+ f'{colorama.Style.RESET_ALL}')
208
+
209
+ # Kubernetes installation steps
210
+ if 'Deploying Kubernetes on head node' in log_line:
211
+ self.status_display.update(
212
+ ux_utils.spinner_message(
213
+ 'Creating remote cluster - '
214
+ 'deploying Kubernetes on head node',
215
+ log_path=self.log_path,
216
+ is_local=self.is_local))
217
+ if 'K3s deployed on head node.' in log_line:
218
+ logger.info(f'{colorama.Fore.GREEN}'
219
+ '✔ K3s successfully deployed on head node.'
220
+ f'{colorama.Style.RESET_ALL}')
221
+
222
+ # Worker nodes
223
+ if 'Deploying Kubernetes on worker node' in log_line:
224
+ self.status_display.update(
225
+ ux_utils.spinner_message(
226
+ 'Creating remote cluster - '
227
+ 'deploying Kubernetes on worker nodes',
228
+ log_path=self.log_path,
229
+ is_local=self.is_local))
230
+ if 'Kubernetes deployed on worker node' in log_line:
231
+ logger.info(f'{colorama.Fore.GREEN}'
232
+ '✔ K3s successfully deployed on worker node.'
233
+ f'{colorama.Style.RESET_ALL}')
234
+
235
+ # Cluster configuration
236
+ if 'Configuring local kubectl to connect to the cluster...' in log_line:
237
+ self.status_display.update(
238
+ ux_utils.spinner_message(
239
+ 'Creating remote cluster - '
240
+ 'configuring local kubectl',
241
+ log_path=self.log_path,
242
+ is_local=self.is_local))
243
+ if 'kubectl configured to connect to the cluster.' in log_line:
244
+ logger.info(f'{colorama.Fore.GREEN}'
245
+ '✔ kubectl configured for the remote cluster.'
246
+ f'{colorama.Style.RESET_ALL}')
247
+
248
+ # GPU operator installation
249
+ if 'Installing Nvidia GPU Operator...' in log_line:
250
+ self.status_display.update(
251
+ ux_utils.spinner_message(
252
+ 'Creating remote cluster - '
253
+ 'installing Nvidia GPU Operator',
254
+ log_path=self.log_path,
255
+ is_local=self.is_local))
256
+ if 'GPU Operator installed.' in log_line:
257
+ logger.info(f'{colorama.Fore.GREEN}'
258
+ '✔ Nvidia GPU Operator installed successfully.'
259
+ f'{colorama.Style.RESET_ALL}')
260
+
261
+ # Cleanup steps
262
+ if 'Cleaning up head node' in log_line:
263
+ self.status_display.update(
264
+ ux_utils.spinner_message('Cleaning up head node',
265
+ log_path=self.log_path,
266
+ is_local=self.is_local))
267
+ if 'Cleaning up node' in log_line:
268
+ self.status_display.update(
269
+ ux_utils.spinner_message('Cleaning up worker node',
270
+ log_path=self.log_path,
271
+ is_local=self.is_local))
272
+ if 'cleaned up successfully' in log_line:
273
+ logger.info(f'{colorama.Fore.GREEN}'
274
+ f'{log_line.strip()}{colorama.Style.RESET_ALL}')
275
+
276
+ # Final status
277
+ if 'Cluster deployment completed.' in log_line:
278
+ logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
279
+ f'{colorama.Style.RESET_ALL}')
280
+
281
+ def __exit__(self, except_type: Optional[Type[BaseException]],
282
+ except_value: Optional[BaseException],
283
+ traceback: Optional[types.TracebackType]) -> None:
128
284
  del except_type, except_value, traceback # unused
129
285
  self.status_display.stop()
130
286
 
@@ -157,7 +313,8 @@ def readable_time_duration(start: Optional[float],
157
313
  e.g. "1h 2m 23s"
158
314
  """
159
315
  # start < 0 means that the starting time is not specified yet.
160
- # It is only used in spot_utils.show_jobs() for job duration calculation.
316
+ # It is only used in jobs_utils.format_job_table() for job duration
317
+ # calculation.
161
318
  if start is None or start < 0:
162
319
  return '-'
163
320
  if end == start == 0:
@@ -191,3 +348,53 @@ def readable_time_duration(start: Optional[float],
191
348
  diff = diff.replace('hour', 'hr')
192
349
 
193
350
  return diff
351
+
352
+
353
+ def follow_logs(
354
+ file: TextIO,
355
+ *,
356
+ should_stop: Callable[[], bool],
357
+ stop_on_eof: bool = False,
358
+ process_line: Optional[Callable[[str], Iterator[str]]] = None,
359
+ idle_timeout_seconds: Optional[int] = None,
360
+ ) -> Iterator[str]:
361
+ """Streams and processes logs line by line from a file.
362
+
363
+ Args:
364
+ file: File object to read logs from.
365
+ should_stop: Callback that returns True when streaming should stop.
366
+ stop_on_eof: If True, stop when reaching end of file.
367
+ process_line: Optional callback to transform/filter each line.
368
+ idle_timeout_seconds: If set, stop after these many seconds without
369
+ new content.
370
+
371
+ Yields:
372
+ Log lines, possibly transformed by process_line if provided.
373
+ """
374
+ current_line: str = ''
375
+ seconds_without_content: int = 0
376
+
377
+ while True:
378
+ content = file.readline()
379
+
380
+ if not content:
381
+ if stop_on_eof or should_stop():
382
+ break
383
+
384
+ if idle_timeout_seconds is not None:
385
+ if seconds_without_content >= idle_timeout_seconds:
386
+ break
387
+ seconds_without_content += 1
388
+
389
+ time.sleep(1)
390
+ continue
391
+
392
+ seconds_without_content = 0
393
+ current_line += content
394
+
395
+ if '\n' in current_line or '\r' in current_line:
396
+ if process_line is not None:
397
+ yield from process_line(current_line)
398
+ else:
399
+ yield current_line
400
+ current_line = ''
@@ -0,0 +1,81 @@
1
+ """Utilities for encoding and decoding messages."""
2
+ import json
3
+ import re
4
+ import typing
5
+ from typing import Any, Literal, Optional, Tuple, Union
6
+
7
# Matches every `<sky-payload{type}>{content}</sky-payload>` envelope in a
# string. Group 1 is the (possibly empty) type tag, group 2 is the JSON body.
_PAYLOAD_PATTERN = re.compile(r'<sky-payload(.*?)>(.*?)</sky-payload>')
# Template used by encode_payload(); must stay in sync with _PAYLOAD_PATTERN.
_PAYLOAD_STR = '<sky-payload{type}>{content}</sky-payload>\n'


def encode_payload(payload: Any, payload_type: Optional[str] = None) -> str:
    """Encode a payload to make it more robust for parsing.

    This makes message transfer more robust to any additional strings added to
    the message during transfer.

    An example message that is polluted by the system warning:
    "LC_ALL: cannot change locale (en_US.UTF-8)\\n<sky-payload>hello, world</sky-payload>" # pylint: disable=line-too-long

    Args:
        payload: A JSON-serializable value (e.g. str, dict or list) to be
            encoded.
        payload_type: Optional tag appended verbatim to the opening
            `<sky-payload` marker. None means no tag.

    Returns:
        A string that is encoded from the payload, terminated by a newline.
    """
    payload_str = json.dumps(payload)
    if payload_type is None:
        payload_type = ''
    return _PAYLOAD_STR.format(type=payload_type, content=payload_str)


@typing.overload
def decode_payload(payload_str: str,
                   payload_type: Optional[str] = None,
                   raise_for_mismatch: Literal[True] = True) -> Any:
    ...


@typing.overload
def decode_payload(
        payload_str: str,
        payload_type: Optional[str] = None,
        raise_for_mismatch: Literal[False] = False) -> Tuple[bool, Any]:
    ...


def decode_payload(
        payload_str: str,
        payload_type: Optional[str] = None,
        raise_for_mismatch: bool = True) -> Union[Tuple[bool, Any], Any]:
    """Decode a payload string.

    The first envelope whose type tag matches `payload_type` is decoded; any
    text surrounding the envelope is ignored.

    Args:
        payload_str: A string that is encoded from a payload.
        payload_type: The type of the payload. None accepts an envelope with
            any type tag.
        raise_for_mismatch: Whether to raise an error if the payload string is
            not valid.

    Returns:
        If raise_for_mismatch is True: the decoded payload.
        If raise_for_mismatch is False: a tuple of (bool, Any). The bool
        indicates whether a matching payload was found. The Any is the decoded
        payload when found, otherwise the original `payload_str` unchanged.

    Raises:
        ValueError: If raise_for_mismatch is True and no envelope with a
            matching type is present in `payload_str`.
    """
    # NOTE: the loop variables deliberately do not reuse the name
    # `payload_str`, so the fallback below reports/returns the caller's
    # original input rather than the content of the last inspected envelope.
    for matched_type, matched_content in _PAYLOAD_PATTERN.findall(payload_str):
        if payload_type is None or payload_type == matched_type:
            decoded = json.loads(matched_content)
            if raise_for_mismatch:
                return decoded
            return True, decoded

    # Either no envelope at all, or none with the requested type.
    if raise_for_mismatch:
        raise ValueError(f'Invalid payload string: \n{payload_str}')
    return False, payload_str
sky/utils/registry.py ADDED
@@ -0,0 +1,127 @@
1
+ """Registry for classes to be discovered"""
2
+
3
+ import typing
4
+ from typing import Callable, Dict, List, Optional, Set, Type, Union
5
+
6
+ from sky.utils import ux_utils
7
+
8
+ if typing.TYPE_CHECKING:
9
+ from sky.backends import backend
10
+ from sky.clouds import cloud
11
+ from sky.jobs import recovery_strategy
12
+
13
+ T = typing.TypeVar('T')
14
+
15
+
16
+ class _Registry(dict, typing.Generic[T]):
17
+ """Registry."""
18
+
19
+ def __init__(self,
20
+ registry_name: str,
21
+ exclude: Optional[Set[str]],
22
+ type_register: bool = False):
23
+ super().__init__()
24
+ self._registry_name = registry_name
25
+ self._exclude = exclude or set()
26
+ self._default: Optional[str] = None
27
+ self._type_register: bool = type_register
28
+ self._aliases: Dict[str, str] = {}
29
+
30
+ def from_str(self, name: Optional[str]) -> Optional[T]:
31
+ """Returns the cloud instance from the canonical name or alias."""
32
+ if name is None:
33
+ return None
34
+
35
+ search_name = name.lower()
36
+ if search_name in self._exclude:
37
+ return None
38
+
39
+ if search_name in self:
40
+ return self[search_name]
41
+
42
+ if search_name in self._aliases:
43
+ return self[self._aliases[search_name]]
44
+
45
+ with ux_utils.print_exception_no_traceback():
46
+ raise ValueError(
47
+ f'{self._registry_name.capitalize()} {name!r} is not a '
48
+ f'valid {self._registry_name} among '
49
+ f'{[*self.keys(), *self._aliases.keys()]}')
50
+
51
+ def type_register(self,
52
+ name: str,
53
+ default: bool = False) -> Callable[[Type[T]], Type[T]]:
54
+
55
+ name = name.lower()
56
+
57
+ def decorator(cls: Type[T]) -> Type[T]:
58
+ assert self._type_register, ('type_register can only be used '
59
+ 'when type_register is True')
60
+ assert name not in self, f'{name} already registered'
61
+ self[name] = cls
62
+ if default:
63
+ self._default = name
64
+ return cls
65
+
66
+ return decorator
67
+
68
+ @typing.overload
69
+ def register(self, cls: Type[T]) -> Type[T]:
70
+ ...
71
+
72
+ @typing.overload
73
+ def register(
74
+ self,
75
+ cls: None = None,
76
+ aliases: Optional[List[str]] = None
77
+ ) -> Callable[[Type[T]], Type[T]]:
78
+ ...
79
+
80
+ def register(
81
+ self,
82
+ cls: Optional[Type[T]] = None,
83
+ aliases: Optional[List[str]] = None
84
+ ) -> Union[Type[T], Callable[[Type[T]], Type[T]]]:
85
+ assert not self._type_register, ('register can only be used when '
86
+ 'type_register is False')
87
+
88
+ def _register(cls: Type[T]) -> Type[T]:
89
+ name = cls.__name__.lower()
90
+ assert name not in self, f'{name} already registered'
91
+ self[name] = cls()
92
+
93
+ for alias in aliases or []:
94
+ alias = alias.lower()
95
+ assert alias not in self._aliases, f'{alias} already registered'
96
+ self._aliases[alias] = name
97
+ return cls
98
+
99
+ if cls is not None:
100
+ # Invocation without parentheses (e.g. @register)
101
+ return _register(cls)
102
+
103
+ # Invocation with parentheses (e.g. @register(aliases=['alias']))
104
+ return _register
105
+
106
+ @property
107
+ def default(self) -> str:
108
+ assert self._default is not None, ('default is not set', self)
109
+ return self._default
110
+
111
+
112
# Backward compatibility: global_user_state's DB may still have the Local
# cloud recorded even though it has been removed from the registry.
# global_user_state.get_enabled_clouds() would call into
# _Registry.from_str() with that name and fail, so 'local' is excluded
# here, which makes from_str() return None for it instead of raising.
CLOUD_REGISTRY: _Registry = _Registry['cloud.Cloud'](
    registry_name='cloud',
    exclude={'local'},
)

# Backends are registered as classes via `type_register`.
BACKEND_REGISTRY: _Registry = _Registry['backend.Backend'](
    registry_name='backend',
    type_register=True,
    exclude=None,
)

# Managed-jobs recovery strategies are registered as classes via
# `type_register`.
JOBS_RECOVERY_STRATEGY_REGISTRY: _Registry = (
    _Registry['recovery_strategy.StrategyExecutor'](
        registry_name='jobs recovery strategy',
        exclude=None,
        type_register=True,
    ))