skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py ADDED
@@ -0,0 +1,302 @@
1
+ """SDK functions for managed jobs."""
2
+ import json
3
+ import typing
4
+ from typing import Dict, List, Optional, Union
5
+ import webbrowser
6
+
7
+ import click
8
+ import requests
9
+
10
+ from sky import sky_logging
11
+ from sky.client import common as client_common
12
+ from sky.client import sdk
13
+ from sky.server import common as server_common
14
+ from sky.server.requests import payloads
15
+ from sky.skylet import constants
16
+ from sky.usage import usage_lib
17
+ from sky.utils import common_utils
18
+ from sky.utils import dag_utils
19
+
20
+ if typing.TYPE_CHECKING:
21
+ import io
22
+
23
+ import sky
24
+
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+
28
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def launch(
    task: Union['sky.Task', 'sky.Dag'],
    name: Optional[str] = None,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
) -> server_common.RequestId:
    """Launches a managed job.

    Please refer to sky.cli.job_launch for documentation.

    Args:
        task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
            managed job.
        name: Name of the managed job.
        _need_confirmation: (Internal only) Whether to show a confirmation
            prompt before launching the job.

    Returns:
        The request ID of the launch request.

    Raises:
        click.Abort: if `_need_confirmation` is set and the user declines the
            confirmation prompt.

    Request Returns:
        job_id (Optional[int]): Job ID for the managed job
        controller_handle (Optional[ResourceHandle]): ResourceHandle of the
            controller

    Request Raises:
        ValueError: cluster does not exist. Or, the entrypoint is not a valid
            chain dag.
        sky.exceptions.NotSupportedError: the feature is not supported.
    """
    dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(dag)
    if _need_confirmation:
        # Submit an optimize request and wait on it before prompting.
        request_id = sdk.optimize(dag)
        sdk.stream_and_get(request_id)
        # The prompt is always a non-empty string, so confirm unconditionally.
        # `abort=True` makes click raise click.Abort on a negative answer.
        prompt = f'Launching a managed job {dag.name!r}. Proceed?'
        click.confirm(prompt, default=True, abort=True, show_default=True)

    dag = client_common.upload_mounts_to_api_server(dag)
    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
    body = payloads.JobsLaunchBody(
        task=dag_str,
        name=name,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/jobs/launch',
        # Serialize the body to a JSON string and parse it back so that
        # `requests` is handed plain JSON-compatible data.
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no read timeout.
        timeout=(5, None),
    )
    return server_common.get_request_id(response)
83
+
84
+
85
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def queue(refresh: bool,
          skip_finished: bool = False,
          all_users: bool = False) -> server_common.RequestId:
    """Requests the statuses of managed jobs.

    Please refer to sky.cli.job_queue for documentation.

    Args:
        refresh: Whether to restart the jobs controller if it is stopped.
        skip_finished: Whether finished jobs should be left out.
        all_users: Whether jobs of all users should be included.

    Returns:
        The request ID of the queue request.

    Request Returns:
        job_records (List[Dict[str, Any]]): One dict per job, holding the
            information of that job.

            .. code-block:: python

                [
                    {
                        'job_id': (int) job id,
                        'job_name': (str) job name,
                        'resources': (str) resources of the job,
                        'submitted_at': (float) timestamp of submission,
                        'end_at': (float) timestamp of end,
                        'duration': (float) duration in seconds,
                        'recovery_count': (int) Number of retries,
                        'status': (sky.jobs.ManagedJobStatus) of the job,
                        'cluster_resources': (str) resources of the cluster,
                        'region': (str) region of the cluster,
                    }
                ]

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
            does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
    request_body = payloads.JobsQueueBody(refresh=refresh,
                                          skip_finished=skip_finished,
                                          all_users=all_users)
    endpoint = f'{server_common.get_server_url()}/jobs/queue'
    # Dump the body to a JSON string and parse it back so that `requests`
    # receives plain JSON-compatible data.
    payload = json.loads(request_body.model_dump_json())
    # Timeout tuple is (connect, read): 5s to connect, no read timeout.
    reply = requests.post(endpoint, json=payload, timeout=(5, None))
    return server_common.get_request_id(response=reply)
139
+
140
+
141
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def cancel(
    name: Optional[str] = None,
    job_ids: Optional[List[int]] = None,
    all: bool = False,  # pylint: disable=redefined-builtin
    all_users: bool = False,
) -> server_common.RequestId:
    """Requests cancellation of managed jobs.

    Please refer to sky.cli.job_cancel for documentation.

    Args:
        name: Name of the managed job to cancel.
        job_ids: IDs of the managed jobs to cancel.
        all: Whether to cancel all managed jobs.
        all_users: Whether to cancel all managed jobs from all users.

    Returns:
        The request ID of the cancel request.

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
        RuntimeError: failed to cancel the job.
    """
    request_body = payloads.JobsCancelBody(name=name,
                                           job_ids=job_ids,
                                           all=all,
                                           all_users=all_users)
    endpoint = f'{server_common.get_server_url()}/jobs/cancel'
    # Dump the body to a JSON string and parse it back so that `requests`
    # receives plain JSON-compatible data.
    payload = json.loads(request_body.model_dump_json())
    # Timeout tuple is (connect, read): 5s to connect, no read timeout.
    reply = requests.post(endpoint, json=payload, timeout=(5, None))
    return server_common.get_request_id(response=reply)
178
+
179
+
180
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def tail_logs(name: Optional[str] = None,
              job_id: Optional[int] = None,
              follow: bool = True,
              controller: bool = False,
              refresh: bool = False,
              output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Streams the logs of a managed job.

    The job can be selected by name or by job ID. If neither is given, the
    logs of the latest job will be shown.

    Args:
        name: Name of the managed job to tail logs.
        job_id: ID of the managed job to tail logs.
        follow: Whether to follow the logs.
        controller: Whether to tail logs from the jobs controller.
        refresh: Whether to restart the jobs controller if it is stopped.
        output_stream: The stream to write the logs to. If None, print to the
            console.

    Request Raises:
        ValueError: invalid arguments.
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
    """
    request_body = payloads.JobsLogsBody(name=name,
                                         job_id=job_id,
                                         follow=follow,
                                         controller=controller,
                                         refresh=refresh)
    endpoint = f'{server_common.get_server_url()}/jobs/logs'
    payload = json.loads(request_body.model_dump_json())
    # stream=True defers downloading the response body; the timeout tuple is
    # (connect, read): 5s to connect, no read timeout.
    reply = requests.post(endpoint,
                          json=payload,
                          stream=True,
                          timeout=(5, None))
    sdk.stream_response(server_common.get_request_id(reply), reply,
                        output_stream)
221
+
222
+
223
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def download_logs(
        name: Optional[str],
        job_id: Optional[int],
        refresh: bool,
        controller: bool,
        local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[int, str]:
    """Downloads the logs of managed jobs.

    Please refer to sky.cli.job_logs for documentation.

    Args:
        name: Name of the managed job to sync down logs.
        job_id: ID of the managed job to sync down logs.
        refresh: Whether to restart the jobs controller if it is stopped.
        controller: Whether to sync down logs from the jobs controller.
        local_dir: Local directory to sync down logs.

    Returns:
        A dictionary mapping job ID to the local path.

    Request Raises:
        ValueError: invalid arguments.
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
    """
    request_body = payloads.JobsDownloadLogsBody(name=name,
                                                 job_id=job_id,
                                                 refresh=refresh,
                                                 controller=controller,
                                                 local_dir=local_dir)
    endpoint = f'{server_common.get_server_url()}/jobs/download_logs'
    payload = json.loads(request_body.model_dump_json())
    # Timeout tuple is (connect, read): 5s to connect, no read timeout.
    reply = requests.post(endpoint, json=payload, timeout=(5, None))

    # The request result maps each job ID to a path on the API server side.
    request_id = server_common.get_request_id(reply)
    remote_path_by_job = sdk.stream_and_get(request_id)
    # Fetch those paths; the helper returns a remote-path -> local-path map.
    local_path_by_remote = client_common.download_logs_from_api_server(
        remote_path_by_job.values())

    local_path_by_job: Dict[int, str] = {}
    for jid, remote_path in remote_path_by_job.items():
        local_path_by_job[jid] = local_path_by_remote[remote_path]
    return local_path_by_job
270
+
271
+
272
# Aliases under the old `spot_*` names, each wrapping the corresponding
# `sky.jobs.*` function through common_utils.deprecated_function.
# NOTE(review): presumably the wrapper warns about the deprecation and then
# delegates to the wrapped function -- confirm in sky.utils.common_utils.
spot_launch = common_utils.deprecated_function(
    launch,
    name='sky.jobs.launch',
    deprecated_name='spot_launch',
    removing_version='0.8.0',
    override_argument={'use_spot': True},
)
spot_queue = common_utils.deprecated_function(
    queue,
    name='sky.jobs.queue',
    deprecated_name='spot_queue',
    removing_version='0.8.0',
)
spot_cancel = common_utils.deprecated_function(
    cancel,
    name='sky.jobs.cancel',
    deprecated_name='spot_cancel',
    removing_version='0.8.0',
)
spot_tail_logs = common_utils.deprecated_function(
    tail_logs,
    name='sky.jobs.tail_logs',
    deprecated_name='spot_tail_logs',
    removing_version='0.8.0',
)
291
+
292
+
293
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def dashboard() -> None:
    """Opens the managed jobs dashboard of the API server in a browser.

    The dashboard URL is built from the API server URL, with the current
    user's hash passed as the `user_hash` query parameter.
    """
    query = f'user_hash={common_utils.get_user_hash()}'
    dashboard_url = f'{server_common.get_server_url()}/jobs/dashboard?{query}'
    logger.info(f'Opening dashboard in browser: {dashboard_url}')
    webbrowser.open(dashboard_url)
sky/jobs/constants.py CHANGED
@@ -1,27 +1,65 @@
1
1
  """Constants used for Managed Jobs."""
2
+ from typing import Dict, Union
3
+
4
+ from sky.skylet import constants as skylet_constants
2
5
 
3
6
  JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
4
7
  JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
8
+ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
5
9
 
6
10
  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
7
11
 
8
12
  # Resources as a dict for the jobs controller.
9
- # Use default CPU instance type for jobs controller with >= 24GB, i.e.
10
- # m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB)
11
- # for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP, etc.
12
- # Based on profiling, memory should be at least 3x (in GB) as num vCPUs to avoid
13
- # OOM (each vCPU can have 4 jobs controller processes as we set the CPU
14
- # requirement to 0.25, and 3 GB is barely enough for 4 job processes).
13
+ # Use smaller CPU instance type for jobs controller, but with more memory, i.e.
14
+ # r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
15
+ # and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
16
+ # Concurrency limits are set based on profiling. 4x num vCPUs is the launch
17
+ # parallelism limit, and memory / 350MB is the limit on concurrently running
18
+ # jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
15
19
  # We use 50 GB disk size to reduce the cost.
16
- CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50}
20
+ CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
21
+ 'cpus': '4+',
22
+ 'memory': '8x',
23
+ 'disk_size': 50
24
+ }
17
25
 
26
+ # TODO(zhwu): This is no longer accurate, after #4592, which increases the
27
+ # length of user hash appended to the cluster name from 4 to 8 chars. This
28
+ # causes the cluster name on GCP to be wrapped twice. However, we cannot
29
+ # directly update this constant, because the job cluster cleanup and much
30
+ # other logic in managed jobs depend on this constant, i.e., updating this
31
+ # constant will break backward compatibility and existing jobs.
32
+ #
18
33
  # Max length of the cluster name for GCP is 35, the user hash to be attached is
19
- # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
20
- # length of the cluster name prefix is 25 to avoid the cluster name being too
21
- # long and truncated twice during the cluster creation.
34
+ # 4(now 8)+1 chars, and we assume the maximum length of the job id is
35
+ # 4(now 8)+1, so the max length of the cluster name prefix is 25(should be 21
36
+ # now) to avoid the cluster name being too long and truncated twice during the
37
+ # cluster creation.
22
38
  JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
23
39
 
24
40
  # The version of the lib files that jobs/utils use. Whenever there is an API
25
41
  # change for the jobs/utils, we need to bump this version and update
26
42
  # jobs.utils.ManagedJobCodeGen to handle the version update.
27
- MANAGED_JOBS_VERSION = 1
43
+ MANAGED_JOBS_VERSION = 2
44
+
45
+ # The command for setting up the jobs dashboard on the controller. It first
46
+ # checks if the systemd services are available, and if not (e.g., Kubernetes
47
+ # containers may not have systemd), it starts the dashboard manually.
48
+ DASHBOARD_SETUP_CMD = (
49
+ 'if command -v systemctl &>/dev/null && systemctl --user show &>/dev/null; '
50
+ 'then '
51
+ ' systemctl --user daemon-reload; '
52
+ ' systemctl --user enable --now skypilot-dashboard; '
53
+ 'else '
54
+ ' echo "Systemd services not found. Starting SkyPilot dashboard '
55
+ 'manually."; '
56
+ # Kill any old dashboard processes;
57
+ ' ps aux | grep -v nohup | grep -v grep | '
58
+ ' grep -- \'-m sky.jobs.dashboard.dashboard\' | awk \'{print $2}\' | '
59
+ ' xargs kill > /dev/null 2>&1 || true;'
60
+ # Launch the dashboard in the background if not already running
61
+ ' (ps aux | grep -v nohup | grep -v grep | '
62
+ ' grep -q -- \'-m sky.jobs.dashboard.dashboard\') || '
63
+ f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
64
+ '>> ~/.sky/job-dashboard.log 2>&1 &); '
65
+ 'fi')