skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/core.py DELETED
@@ -1,330 +0,0 @@
1
- """SDK functions for managed jobs."""
2
- import os
3
- import tempfile
4
- from typing import Any, Dict, List, Optional, Union
5
- import uuid
6
-
7
- import colorama
8
-
9
- import sky
10
- from sky import backends
11
- from sky import exceptions
12
- from sky import sky_logging
13
- from sky import status_lib
14
- from sky import task as task_lib
15
- from sky.backends import backend_utils
16
- from sky.clouds.service_catalog import common as service_catalog_common
17
- from sky.jobs import constants as managed_job_constants
18
- from sky.jobs import utils as managed_job_utils
19
- from sky.skylet import constants as skylet_constants
20
- from sky.usage import usage_lib
21
- from sky.utils import common_utils
22
- from sky.utils import controller_utils
23
- from sky.utils import dag_utils
24
- from sky.utils import rich_utils
25
- from sky.utils import subprocess_utils
26
- from sky.utils import ux_utils
27
-
28
-
29
- @usage_lib.entrypoint
30
- def launch(
31
- task: Union['sky.Task', 'sky.Dag'],
32
- name: Optional[str] = None,
33
- stream_logs: bool = True,
34
- detach_run: bool = False,
35
- retry_until_up: bool = False,
36
- ) -> None:
37
- # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
38
- """Launch a managed job.
39
-
40
- Please refer to sky.cli.job_launch for documentation.
41
-
42
- Args:
43
- task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
44
- managed job.
45
- name: Name of the managed job.
46
- detach_run: Whether to detach the run.
47
-
48
- Raises:
49
- ValueError: cluster does not exist. Or, the entrypoint is not a valid
50
- chain dag.
51
- sky.exceptions.NotSupportedError: the feature is not supported.
52
- """
53
- entrypoint = task
54
- dag_uuid = str(uuid.uuid4().hex[:4])
55
-
56
- dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
57
- if not dag.is_chain():
58
- with ux_utils.print_exception_no_traceback():
59
- raise ValueError('Only single-task or chain DAG is '
60
- f'allowed for job_launch. Dag: {dag}')
61
-
62
- dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
63
-
64
- task_names = set()
65
- for task_ in dag.tasks:
66
- if task_.name in task_names:
67
- with ux_utils.print_exception_no_traceback():
68
- raise ValueError(
69
- f'Task name {task_.name!r} is duplicated in the DAG. '
70
- 'Either change task names to be unique, or specify the DAG '
71
- 'name only and comment out the task names (so that they '
72
- 'will be auto-generated) .')
73
- task_names.add(task_.name)
74
-
75
- dag_utils.fill_default_config_in_dag_for_job_launch(dag)
76
-
77
- for task_ in dag.tasks:
78
- controller_utils.maybe_translate_local_file_mounts_and_sync_up(
79
- task_, path='jobs')
80
-
81
- with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
82
- mode='w') as f:
83
- dag_utils.dump_chain_dag_to_yaml(dag, f.name)
84
- controller = controller_utils.Controllers.JOBS_CONTROLLER
85
- controller_name = controller.value.cluster_name
86
- prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
87
- remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
88
- remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
89
- controller_resources = controller_utils.get_controller_resources(
90
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
91
- task_resources=sum([list(t.resources) for t in dag.tasks], []))
92
-
93
- vars_to_fill = {
94
- 'remote_user_yaml_path': remote_user_yaml_path,
95
- 'user_yaml_path': f.name,
96
- 'jobs_controller': controller_name,
97
- # Note: actual cluster name will be <task.name>-<managed job ID>
98
- 'dag_name': dag.name,
99
- 'retry_until_up': retry_until_up,
100
- 'remote_user_config_path': remote_user_config_path,
101
- 'modified_catalogs':
102
- service_catalog_common.get_modified_catalog_file_mounts(),
103
- **controller_utils.shared_controller_vars_to_fill(
104
- controller_utils.Controllers.JOBS_CONTROLLER,
105
- remote_user_config_path=remote_user_config_path,
106
- ),
107
- }
108
-
109
- yaml_path = os.path.join(
110
- managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
111
- f'{name}-{dag_uuid}.yaml')
112
- common_utils.fill_template(
113
- managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
114
- vars_to_fill,
115
- output_path=yaml_path)
116
- controller_task = task_lib.Task.from_yaml(yaml_path)
117
- controller_task.set_resources(controller_resources)
118
-
119
- controller_task.managed_job_dag = dag
120
- assert len(controller_task.resources) == 1, controller_task
121
-
122
- sky_logging.print(
123
- f'{colorama.Fore.YELLOW}'
124
- f'Launching managed job {dag.name!r} from jobs controller...'
125
- f'{colorama.Style.RESET_ALL}')
126
- sky_logging.print('Launching jobs controller...')
127
- sky.launch(task=controller_task,
128
- stream_logs=stream_logs,
129
- cluster_name=controller_name,
130
- detach_run=detach_run,
131
- idle_minutes_to_autostop=skylet_constants.
132
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
133
- retry_until_up=True,
134
- _disable_controller_check=True)
135
-
136
-
137
- @usage_lib.entrypoint
138
- def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
139
- # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
140
- """Get statuses of managed jobs.
141
-
142
- Please refer to sky.cli.job_queue for documentation.
143
-
144
- Returns:
145
- [
146
- {
147
- 'job_id': int,
148
- 'job_name': str,
149
- 'resources': str,
150
- 'submitted_at': (float) timestamp of submission,
151
- 'end_at': (float) timestamp of end,
152
- 'duration': (float) duration in seconds,
153
- 'recovery_count': (int) Number of retries,
154
- 'status': (sky.jobs.ManagedJobStatus) of the job,
155
- 'cluster_resources': (str) resources of the cluster,
156
- 'region': (str) region of the cluster,
157
- }
158
- ]
159
- Raises:
160
- sky.exceptions.ClusterNotUpError: the jobs controller is not up or
161
- does not exist.
162
- RuntimeError: if failed to get the managed jobs with ssh.
163
- """
164
- jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
165
- stopped_message = ''
166
- if not refresh:
167
- stopped_message = 'No in-progress managed jobs.'
168
- try:
169
- handle = backend_utils.is_controller_accessible(
170
- controller=jobs_controller_type, stopped_message=stopped_message)
171
- except exceptions.ClusterNotUpError as e:
172
- if not refresh:
173
- raise
174
- handle = None
175
- controller_status = e.cluster_status
176
-
177
- if refresh and handle is None:
178
- sky_logging.print(f'{colorama.Fore.YELLOW}'
179
- 'Restarting controller for latest status...'
180
- f'{colorama.Style.RESET_ALL}')
181
-
182
- rich_utils.force_update_status(
183
- '[cyan] Checking managed jobs - restarting '
184
- 'controller[/]')
185
- handle = sky.start(jobs_controller_type.value.cluster_name)
186
- controller_status = status_lib.ClusterStatus.UP
187
- rich_utils.force_update_status('[cyan] Checking managed jobs[/]')
188
-
189
- assert handle is not None, (controller_status, refresh)
190
-
191
- backend = backend_utils.get_backend_from_handle(handle)
192
- assert isinstance(backend, backends.CloudVmRayBackend)
193
-
194
- code = managed_job_utils.ManagedJobCodeGen.get_job_table()
195
- returncode, job_table_payload, stderr = backend.run_on_head(
196
- handle,
197
- code,
198
- require_outputs=True,
199
- stream_logs=False,
200
- separate_stderr=True)
201
-
202
- try:
203
- subprocess_utils.handle_returncode(returncode,
204
- code,
205
- 'Failed to fetch managed jobs',
206
- job_table_payload + stderr,
207
- stream_logs=False)
208
- except exceptions.CommandError as e:
209
- raise RuntimeError(str(e)) from e
210
-
211
- jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
212
- if skip_finished:
213
- # Filter out the finished jobs. If a multi-task job is partially
214
- # finished, we will include all its tasks.
215
- non_finished_tasks = list(
216
- filter(lambda job: not job['status'].is_terminal(), jobs))
217
- non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
218
- jobs = list(
219
- filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
220
- return jobs
221
-
222
-
223
- @usage_lib.entrypoint
224
- # pylint: disable=redefined-builtin
225
- def cancel(name: Optional[str] = None,
226
- job_ids: Optional[List[int]] = None,
227
- all: bool = False) -> None:
228
- # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
229
- """Cancel managed jobs.
230
-
231
- Please refer to sky.cli.job_cancel for documentation.
232
-
233
- Raises:
234
- sky.exceptions.ClusterNotUpError: the jobs controller is not up.
235
- RuntimeError: failed to cancel the job.
236
- """
237
- job_ids = [] if job_ids is None else job_ids
238
- handle = backend_utils.is_controller_accessible(
239
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
240
- stopped_message='All managed jobs should have finished.')
241
-
242
- job_id_str = ','.join(map(str, job_ids))
243
- if sum([len(job_ids) > 0, name is not None, all]) != 1:
244
- argument_str = f'job_ids={job_id_str}' if len(job_ids) > 0 else ''
245
- argument_str += f' name={name}' if name is not None else ''
246
- argument_str += ' all' if all else ''
247
- with ux_utils.print_exception_no_traceback():
248
- raise ValueError('Can only specify one of JOB_IDS or name or all. '
249
- f'Provided {argument_str!r}.')
250
-
251
- backend = backend_utils.get_backend_from_handle(handle)
252
- assert isinstance(backend, backends.CloudVmRayBackend)
253
- if all:
254
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
255
- elif job_ids:
256
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
257
- else:
258
- assert name is not None, (job_ids, name, all)
259
- code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
260
- # The stderr is redirected to stdout
261
- returncode, stdout, _ = backend.run_on_head(handle,
262
- code,
263
- require_outputs=True,
264
- stream_logs=False)
265
- try:
266
- subprocess_utils.handle_returncode(returncode, code,
267
- 'Failed to cancel managed job',
268
- stdout)
269
- except exceptions.CommandError as e:
270
- with ux_utils.print_exception_no_traceback():
271
- raise RuntimeError(e.error_msg) from e
272
-
273
- sky_logging.print(stdout)
274
- if 'Multiple jobs found with name' in stdout:
275
- with ux_utils.print_exception_no_traceback():
276
- raise RuntimeError(
277
- 'Please specify the job ID instead of the job name.')
278
-
279
-
280
- @usage_lib.entrypoint
281
- def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
282
- controller: bool) -> None:
283
- # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
284
- """Tail logs of managed jobs.
285
-
286
- Please refer to sky.cli.job_logs for documentation.
287
-
288
- Raises:
289
- ValueError: invalid arguments.
290
- sky.exceptions.ClusterNotUpError: the jobs controller is not up.
291
- """
292
- # TODO(zhwu): Automatically restart the jobs controller
293
- jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
294
- handle = backend_utils.is_controller_accessible(
295
- controller=jobs_controller_type,
296
- stopped_message=(
297
- 'Please restart the jobs controller with '
298
- f'`sky start {jobs_controller_type.value.cluster_name}`.'))
299
-
300
- if name is not None and job_id is not None:
301
- raise ValueError('Cannot specify both name and job_id.')
302
- backend = backend_utils.get_backend_from_handle(handle)
303
- assert isinstance(backend, backends.CloudVmRayBackend), backend
304
-
305
- backend.tail_managed_job_logs(handle,
306
- job_id=job_id,
307
- job_name=name,
308
- follow=follow,
309
- controller=controller)
310
-
311
-
312
- spot_launch = common_utils.deprecated_function(
313
- launch,
314
- name='sky.jobs.launch',
315
- deprecated_name='spot_launch',
316
- removing_version='0.8.0',
317
- override_argument={'use_spot': True})
318
- spot_queue = common_utils.deprecated_function(queue,
319
- name='sky.jobs.queue',
320
- deprecated_name='spot_queue',
321
- removing_version='0.8.0')
322
- spot_cancel = common_utils.deprecated_function(cancel,
323
- name='sky.jobs.cancel',
324
- deprecated_name='spot_cancel',
325
- removing_version='0.8.0')
326
- spot_tail_logs = common_utils.deprecated_function(
327
- tail_logs,
328
- name='sky.jobs.tail_logs',
329
- deprecated_name='spot_tail_logs',
330
- removing_version='0.8.0')
sky/skylet/providers/azure/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- """Azure node provider"""
2
- from sky.skylet.providers.azure.node_provider import AzureNodeProvider
sky/skylet/providers/azure/azure-vm-template.json DELETED
@@ -1,301 +0,0 @@
1
- {
2
- "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3
- "contentVersion": "1.0.0.0",
4
- "parameters": {
5
- "vmName": {
6
- "type": "string",
7
- "metadata": {
8
- "description": "The name of you Virtual Machine."
9
- }
10
- },
11
- "adminUsername": {
12
- "type": "string",
13
- "metadata": {
14
- "description": "Username for the Virtual Machine."
15
- }
16
- },
17
- "publicKey": {
18
- "type": "securestring",
19
- "metadata": {
20
- "description": "SSH Key for the Virtual Machine"
21
- }
22
- },
23
- "imagePublisher": {
24
- "type": "string",
25
- "metadata": {
26
- "description": "The publisher of the VM image"
27
- }
28
- },
29
- "imageOffer": {
30
- "type": "string",
31
- "metadata": {
32
- "description": "The offer of the VM image"
33
- }
34
- },
35
- "imageSku": {
36
- "type": "string",
37
- "metadata": {
38
- "description": "The sku of the VM image"
39
- }
40
- },
41
- "imageVersion": {
42
- "type": "string",
43
- "metadata": {
44
- "description": "The version of the VM image"
45
- }
46
- },
47
- "vmSize": {
48
- "type": "string",
49
- "metadata": {
50
- "description": "The size of the VM"
51
- }
52
- },
53
- "vmTags": {
54
- "type": "object",
55
- "metadata": {
56
- "description": "Tags for the VM"
57
- }
58
- },
59
- "vmCount": {
60
- "type": "int",
61
- "metadata": {
62
- "description": "Number of VMs to deploy"
63
- }
64
- },
65
- "provisionPublicIp": {
66
- "type": "bool",
67
- "defaultValue": true,
68
- "metadata": {
69
- "description": "If true creates a public ip"
70
- }
71
- },
72
- "priority": {
73
- "type": "string",
74
- "defaultValue": "Regular",
75
- "metadata": {
76
- "description": "Specifies the priority for the virtual machine."
77
- }
78
- },
79
- "billingProfile": {
80
- "type": "object",
81
- "defaultValue": {},
82
- "metadata": {
83
- "description": "Specifies the maximum price to pay for Azure Spot VM."
84
- }
85
- },
86
- "osDiskSizeGB": {
87
- "type": "int",
88
- "metadata": {
89
- "description": "OS disk size in GBs."
90
- }
91
- },
92
- "msi": {
93
- "type": "string",
94
- "metadata": {
95
- "description": "Managed service identity resource id."
96
- }
97
- },
98
- "nsg": {
99
- "type": "string",
100
- "metadata": {
101
- "description": "Network security group resource id."
102
- }
103
- },
104
- "subnet": {
105
- "type": "string",
106
- "metadata": {
107
- "descriptions": "Subnet resource id."
108
- }
109
- },
110
- "osDiskTier": {
111
- "type": "string",
112
- "allowedValues": [
113
- "Premium_LRS",
114
- "StandardSSD_LRS",
115
- "Standard_LRS"
116
- ],
117
- "metadata": {
118
- "description": "OS disk tier."
119
- }
120
- },
121
- "cloudInitSetupCommands": {
122
- "type": "string",
123
- "metadata": {
124
- "description": "Base64 encoded cloud-init setup commands."
125
- }
126
- }
127
- },
128
- "variables": {
129
- "location": "[resourceGroup().location]",
130
- "networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
131
- "networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
132
- "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
133
- "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
134
- "publicIpAddressName": "[concat(parameters('vmName'), '-ip')]"
135
- },
136
- "resources": [
137
- {
138
- "type": "Microsoft.Network/networkInterfaces",
139
- "apiVersion": "2020-06-01",
140
- "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
141
- "location": "[variables('location')]",
142
- "dependsOn": [
143
- "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
144
- ],
145
- "copy": {
146
- "name": "NICPublicCopy",
147
- "count": "[parameters('vmCount')]"
148
- },
149
- "properties": {
150
- "ipConfigurations": [
151
- {
152
- "name": "[variables('networkIpConfig')]",
153
- "properties": {
154
- "subnet": {
155
- "id": "[parameters('subnet')]"
156
- },
157
- "privateIPAllocationMethod": "Dynamic",
158
- "publicIpAddress": {
159
- "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
160
- }
161
- }
162
- }
163
- ],
164
- "networkSecurityGroup": {
165
- "id": "[parameters('nsg')]"
166
- }
167
- },
168
- "condition": "[parameters('provisionPublicIp')]"
169
- },
170
- {
171
- "type": "Microsoft.Network/networkInterfaces",
172
- "apiVersion": "2020-06-01",
173
- "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
174
- "location": "[variables('location')]",
175
- "copy": {
176
- "name": "NICPrivateCopy",
177
- "count": "[parameters('vmCount')]"
178
- },
179
- "properties": {
180
- "ipConfigurations": [
181
- {
182
- "name": "[variables('networkIpConfig')]",
183
- "properties": {
184
- "subnet": {
185
- "id": "[parameters('subnet')]"
186
- },
187
- "privateIPAllocationMethod": "Dynamic"
188
- }
189
- }
190
- ],
191
- "networkSecurityGroup": {
192
- "id": "[parameters('nsg')]"
193
- }
194
- },
195
- "condition": "[not(parameters('provisionPublicIp'))]"
196
- },
197
- {
198
- "type": "Microsoft.Network/publicIpAddresses",
199
- "apiVersion": "2019-02-01",
200
- "name": "[concat(variables('publicIpAddressName'), copyIndex())]",
201
- "location": "[variables('location')]",
202
- "properties": {
203
- "publicIpAllocationMethod": "Static",
204
- "publicIPAddressVersion": "IPv4"
205
- },
206
- "copy": {
207
- "name": "PublicIpCopy",
208
- "count": "[parameters('vmCount')]"
209
- },
210
- "sku": {
211
- "name": "Basic",
212
- "tier": "Regional"
213
- },
214
- "condition": "[parameters('provisionPublicIp')]"
215
- },
216
- {
217
- "type": "Microsoft.Compute/virtualMachines",
218
- "apiVersion": "2019-03-01",
219
- "name": "[concat(parameters('vmName'), copyIndex())]",
220
- "location": "[variables('location')]",
221
- "dependsOn": [
222
- "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
223
- ],
224
- "copy": {
225
- "name": "VmCopy",
226
- "count": "[parameters('vmCount')]"
227
- },
228
- "tags": "[parameters('vmTags')]",
229
- "properties": {
230
- "hardwareProfile": {
231
- "vmSize": "[parameters('vmSize')]"
232
- },
233
- "storageProfile": {
234
- "osDisk": {
235
- "createOption": "fromImage",
236
- "managedDisk": {
237
- "storageAccountType": "[parameters('osDiskTier')]"
238
- },
239
- "diskSizeGB": "[parameters('osDiskSizeGB')]"
240
- },
241
- "imageReference": {
242
- "publisher": "[parameters('imagePublisher')]",
243
- "offer": "[parameters('imageOffer')]",
244
- "sku": "[parameters('imageSku')]",
245
- "version": "[parameters('imageVersion')]"
246
- }
247
- },
248
- "networkProfile": {
249
- "networkInterfaces": [
250
- {
251
- "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
252
- }
253
- ]
254
- },
255
- "osProfile": {
256
- "computerName": "[concat(parameters('vmName'), copyIndex())]",
257
- "adminUsername": "[parameters('adminUsername')]",
258
- "adminPassword": "[parameters('publicKey')]",
259
- "linuxConfiguration": {
260
- "disablePasswordAuthentication": true,
261
- "ssh": {
262
- "publicKeys": [
263
- {
264
- "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
265
- "keyData": "[parameters('publicKey')]"
266
- }
267
- ]
268
- }
269
- },
270
- "customData": "[parameters('cloudInitSetupCommands')]"
271
- },
272
- "priority": "[parameters('priority')]",
273
- "billingProfile": "[parameters('billingProfile')]"
274
- },
275
- "identity": {
276
- "type": "UserAssigned",
277
- "userAssignedIdentities": {
278
- "[parameters('msi')]": {
279
- }
280
- }
281
- }
282
- }
283
- ],
284
- "outputs": {
285
- "publicIp": {
286
- "type": "array",
287
- "copy": {
288
- "count": "[parameters('vmCount')]",
289
- "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
290
- },
291
- "condition": "[parameters('provisionPublicIp')]"
292
- },
293
- "privateIp": {
294
- "type": "array",
295
- "copy": {
296
- "count": "[parameters('vmCount')]",
297
- "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
298
- }
299
- }
300
- }
301
- }