skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py ADDED
@@ -0,0 +1,307 @@
1
+ """Scheduler for managed jobs.
2
+
3
+ Once managed jobs are submitted via submit_job, the scheduler is responsible for
4
+ the business logic of deciding when they are allowed to start, and choosing the
5
+ right one to start. The scheduler will also schedule jobs that are already live
6
+ but waiting to launch a new task or recover.
7
+
8
+ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
9
+ be called from any code running on the managed jobs controller instance to
10
+ trigger scheduling of new jobs if possible. This function should be called
11
+ immediately after any state change that could result in jobs newly being able to
12
+ be scheduled.
13
+
14
+ The scheduling logic limits the number of running jobs according to two limits:
15
+ 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
16
+ once, based on the number of CPUs. (See _get_launch_parallelism.) This is the
17
+ most compute-intensive part of the job lifecycle, which is why we have an
18
+ additional limit.
19
+ 2. The number of jobs that can be running at any given time, based on the amount
20
+ of memory. (See _get_job_parallelism.) Since the job controller is doing very
21
+ little once a job starts (just checking its status periodically), the most
22
+ significant resource it consumes is memory.
23
+
24
+ The state of the scheduler is entirely determined by the schedule_state column
25
+ of all the jobs in the job_info table. This column should only be modified via
26
+ the functions defined in this file. We will always hold the lock while modifying
27
+ this state. See state.ManagedJobScheduleState.
28
+
29
+ Nomenclature:
30
+ - job: same as managed job (may include multiple tasks)
31
+ - launch/launching: launching a cluster (sky.launch) as part of a job
32
+ - start/run: create the job controller process for a job
33
+ - schedule: transition a job to the LAUNCHING state, whether a new job or a job
34
+ that is already alive
35
+ - alive: a job controller exists (includes multiple schedule_states: ALIVE,
36
+ ALIVE_WAITING, LAUNCHING)
37
+ """
38
+
39
+ from argparse import ArgumentParser
40
+ import contextlib
41
+ from functools import lru_cache
42
+ import os
43
+ import time
44
+
45
+ import filelock
46
+ import psutil
47
+
48
+ from sky import sky_logging
49
+ from sky.jobs import constants as managed_job_constants
50
+ from sky.jobs import state
51
+ from sky.skylet import constants
52
+ from sky.utils import common_utils
53
+ from sky.utils import subprocess_utils
54
+
55
+ logger = sky_logging.init_logger('sky.jobs.controller')
56
+
57
+ # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
58
+ # parallelism control or updating the schedule_state of any job.
59
+ # Any code that takes this lock must conclude by calling
60
+ # maybe_schedule_next_jobs.
61
+ _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
62
+ _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
63
+
64
+ # Based on testing, assume a running job uses 350MB memory.
65
+ JOB_MEMORY_MB = 350
66
+ # Past 2000 simultaneous jobs, we become unstable.
67
+ # See https://github.com/skypilot-org/skypilot/issues/4649.
68
+ MAX_JOB_LIMIT = 2000
69
+ # Number of ongoing launches allowed per CPU.
70
+ LAUNCHES_PER_CPU = 4
71
+
72
+
73
@lru_cache(maxsize=1)
def _get_lock_path() -> str:
    """Return the user-expanded path of the scheduler lock file.

    The parent directory of the lock file is created if it does not exist.
    The result is cached, so the expansion and directory creation only run on
    the first call in a process.
    """
    lock_path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
    lock_dir = os.path.dirname(lock_path)
    os.makedirs(lock_dir, exist_ok=True)
    return lock_path
78
+
79
+
80
def maybe_schedule_next_jobs() -> None:
    """Schedule as many waiting managed jobs as the current limits allow.

    Here, "schedule" means to select a job that is waiting and allow it to
    proceed. It does NOT mean to submit a job to the scheduler.

    - For a newly submitted job (WAITING), scheduling updates the job's
      schedule_state and starts its job controller process.
    - For a job that is already alive but waiting to launch a new task or to
      recover (ALIVE_WAITING), scheduling only updates the schedule_state so
      that the waiting controller knows the launch can proceed.

    Jobs are moved into LAUNCHING on a best-effort basis: if any job can be
    started it will be, otherwise the function returns (almost) immediately.
    When some WAITING or ALIVE_WAITING jobs cannot be started now, another
    call to this function is expected once the blocker is gone:

    - If the lock is held by someone else, that lock holder should start the
      jobs.
    - If there are not enough resources, the next controller to exit and
      free up resources should start the jobs.

    Once the lock is obtained, as many jobs as possible are launched before
    it is released. That is what lets concurrent callers bail out right away
    when the lock is busy while still getting every job started as soon as
    possible.

    Controller processes are started with
    subprocess_utils.launch_new_process_tree(), which should be safe to call
    from pretty much any code running on the jobs controller instance. The
    new controller processes are detached from the current process; there is
    no parent/child relationship. See launch_new_process_tree for more.
    """
    schedule_states = state.ManagedJobScheduleState
    try:
        # A single global lock (not a per-job lock) is required for correct
        # parallelism control. With blocking=False we give up immediately if
        # somebody else holds it; that holder is expected to launch whatever
        # can be launched before releasing the lock.
        with filelock.FileLock(_get_lock_path(), blocking=False):
            while True:
                candidate = state.get_waiting_job()
                if candidate is None:
                    # No job is waiting any more; scheduling is finished.
                    break

                current_state = candidate['schedule_state']

                assert current_state in (
                    schedule_states.ALIVE_WAITING,
                    schedule_states.WAITING), candidate

                # ALIVE_WAITING jobs are expected to come back before WAITING
                # jobs, because they were submitted (and so started) earlier.
                # Launching inside an alive job has the more lenient
                # requirement, so it cannot happen that an ALIVE_WAITING job
                # is blocked while a WAITING job could have been started.
                if current_state == schedule_states.ALIVE_WAITING:
                    can_proceed = _can_lauch_in_alive_job()
                elif current_state == schedule_states.WAITING:
                    can_proceed = _can_start_new_job()
                else:
                    can_proceed = True
                if not can_proceed:
                    # Limits reached: nothing more can be scheduled now.
                    break

                job_id = candidate['job_id']
                logger.debug(f'Scheduling job {job_id}')
                state.scheduler_set_launching(job_id, current_state)

                if current_state != schedule_states.WAITING:
                    # The controller of an alive job is already running and
                    # only needed the state transition above.
                    continue

                # Brand-new job: its controller process does not exist yet,
                # so it has to be started here.
                dag_yaml_path = candidate['dag_yaml_path']
                env_file = candidate['env_file_path']

                # If the command line here is changed, please also update
                # utils._controller_process_alive. `--job-id X` should be at
                # the end.
                command_parts = [
                    f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};',
                    f'source {env_file};' if env_file else '',
                    'python -u -m sky.jobs.controller '
                    f'{dag_yaml_path} --job-id {job_id};',
                ]
                run_cmd = ''.join(command_parts)

                logs_dir = os.path.expanduser(
                    managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
                os.makedirs(logs_dir, exist_ok=True)
                log_path = os.path.join(logs_dir, f'{job_id}.log')

                pid = subprocess_utils.launch_new_process_tree(
                    run_cmd, log_output=log_path)
                state.set_job_controller_pid(job_id, pid)

                logger.debug(f'Job {job_id} started with pid {pid}')

    except filelock.Timeout:
        # The lock is held elsewhere. Just return: the process holding the
        # lock should launch any pending jobs.
        pass
185
+
186
+
187
def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
    """Hand an already-created job over to the scheduler.

    Call this after the job has been created in the `spot` table as PENDING.
    The job is marked as waiting, and then the scheduler is asked to start
    its job controller if resources are available.

    Acquiring the scheduler lock may block, so this should not be on the
    critical path for `sky jobs launch -d`.

    The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling
    this.

    Args:
        job_id: ID of the managed job to submit.
        dag_yaml_path: Path to the job's DAG yaml file.
        env_file_path: Path to the controller env file for the job.
    """
    scheduler_lock = filelock.FileLock(_get_lock_path())
    with scheduler_lock:
        user_hash = common_utils.get_user_hash()
        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
                                    user_hash)
    # Runs after the lock is released: the scheduler takes the same lock
    # itself, in non-blocking mode.
    maybe_schedule_next_jobs()
201
+
202
+
203
@contextlib.contextmanager
def scheduled_launch(job_id: int):
    """Context manager guarding a cluster launch inside a running job.

    A freshly started job is already in the LAUNCHING schedule_state, so the
    context is entered immediately.

    For a job that is already ongoing (ALIVE schedule_state), the job
    controller may need to call sky.launch again in two situations:
      - for tasks after the first task
      - for recovery

    In those cases the job is first marked ALIVE_WAITING, which tells the
    scheduler that it wants to go back to LAUNCHING. The schedule_state is
    then polled until the scheduler has made that transition, and only then
    is the context entered.

    When the context exits, the job transitions to ALIVE.

    Only use this inside the job controller of the given job_id. Nesting
    several uses of this context has undefined behavior. Don't do that.
    """
    launching = state.ManagedJobScheduleState.LAUNCHING

    def _still_waiting() -> bool:
        # True while the scheduler has not (yet) put this job in LAUNCHING.
        return state.get_job_schedule_state(job_id) != launching

    # Already LAUNCHING (possible for the first launch of a job): no need to
    # wait for the scheduler.
    if _still_waiting():
        # Not LAUNCHING, so ask to be scheduled and wait for our turn.
        _set_alive_waiting(job_id)

        while _still_waiting():
            time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)

    yield

    scheduler_lock = filelock.FileLock(_get_lock_path())
    with scheduler_lock:
        state.scheduler_set_alive(job_id)
    # Runs after the lock is released: the scheduler takes the same lock
    # itself, in non-blocking mode.
    maybe_schedule_next_jobs()
242
+
243
+
244
def job_done(job_id: int, idempotent: bool = False) -> None:
    """Move a job to the DONE schedule_state.

    The job may be in any terminal ManagedJobStatus. Once it is DONE it
    should never transition back to another state.

    Args:
        job_id: ID of the managed job.
        idempotent: If True, return without error when the job is already
            DONE.
    """
    if idempotent:
        current = state.get_job_schedule_state(job_id)
        if current == state.ManagedJobScheduleState.DONE:
            return

    scheduler_lock = filelock.FileLock(_get_lock_path())
    with scheduler_lock:
        state.scheduler_set_done(job_id, idempotent)
    # Runs after the lock is released: the scheduler takes the same lock
    # itself, in non-blocking mode.
    maybe_schedule_next_jobs()
260
+
261
+
262
def _set_alive_waiting(job_id: int) -> None:
    """Mark a job as ALIVE_WAITING and trigger a scheduling pass.

    Not meant to be called directly; within this file it is used by
    scheduled_launch() to request a transition back to LAUNCHING.
    """
    scheduler_lock = filelock.FileLock(_get_lock_path())
    with scheduler_lock:
        state.scheduler_set_alive_waiting(job_id)
    # Runs after the lock is released: the scheduler takes the same lock
    # itself, in non-blocking mode.
    maybe_schedule_next_jobs()
267
+
268
+
269
def _get_job_parallelism() -> int:
    """Return how many jobs may be alive at once, based on total memory.

    The limit is the total system memory divided by the assumed per-job
    memory (JOB_MEMORY_MB), capped at MAX_JOB_LIMIT and never below 1.
    """
    bytes_per_job = JOB_MEMORY_MB * 1024 * 1024
    total_memory_bytes = psutil.virtual_memory().total
    memory_bound = total_memory_bytes // bytes_per_job
    capped_limit = min(memory_bound, MAX_JOB_LIMIT)
    # Always allow at least one job, even on very small machines.
    return max(capped_limit, 1)
275
+
276
+
277
def _get_launch_parallelism() -> int:
    """Return how many launches may be in progress at once.

    This is LAUNCHES_PER_CPU times the CPU count, or 1 when the CPU count
    cannot be determined.
    """
    cpu_count = os.cpu_count()
    if cpu_count is None:
        return 1
    return cpu_count * LAUNCHES_PER_CPU
280
+
281
+
282
def _can_start_new_job() -> bool:
    """Return True if a brand-new job may be started right now.

    Both limits must have room: the number of launching jobs must be below
    the launch parallelism, and the number of alive jobs must be below the
    job parallelism.
    """
    num_launching = state.get_num_launching_jobs()
    num_alive = state.get_num_alive_jobs()
    if num_launching >= _get_launch_parallelism():
        return False
    return num_alive < _get_job_parallelism()
287
+
288
+
289
def _can_lauch_in_alive_job() -> bool:
    """Return True if an already-alive job may start a launch right now.

    Only the launch parallelism limit applies here.
    """
    num_launching = state.get_num_launching_jobs()
    launch_limit = _get_launch_parallelism()
    return num_launching < launch_limit
292
+
293
+
294
if __name__ == '__main__':
    # Command-line entry point: parse the job description and submit the job
    # to the scheduler.
    cli = ArgumentParser()
    cli.add_argument('dag_yaml',
                     type=str,
                     help='The path to the user job yaml file.')
    cli.add_argument('--job-id',
                     required=True,
                     type=int,
                     help='Job id for the controller job.')
    cli.add_argument('--env-file',
                     type=str,
                     help='The path to the controller env file.')
    parsed = cli.parse_args()
    submit_job(parsed.job_id, parsed.dag_yaml, parsed.env_file)
@@ -0,0 +1 @@
1
+