skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py CHANGED
@@ -8,27 +8,32 @@ import json
8
8
  import os
9
9
  import pathlib
10
10
  import shlex
11
- import subprocess
11
+ import signal
12
+ import sqlite3
12
13
  import time
13
- import typing
14
- from typing import Any, Dict, List, Optional, Tuple
14
+ from typing import Any, Dict, List, Optional, Sequence
15
15
 
16
16
  import colorama
17
17
  import filelock
18
18
  import psutil
19
19
 
20
+ from sky import global_user_state
20
21
  from sky import sky_logging
21
22
  from sky.skylet import constants
22
23
  from sky.utils import common_utils
23
24
  from sky.utils import db_utils
24
25
  from sky.utils import log_utils
25
-
26
- if typing.TYPE_CHECKING:
27
- from ray.dashboard.modules.job import pydantic_models as ray_pydantic
26
+ from sky.utils import message_utils
27
+ from sky.utils import subprocess_utils
28
28
 
29
29
  logger = sky_logging.init_logger(__name__)
30
30
 
31
+ _LINUX_NEW_LINE = '\n'
31
32
  _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
33
+ # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
34
+ # with pid is the same driver process to guard against the case where
35
+ # the same pid is reused by a different process.
36
+ JOB_CMD_IDENTIFIER = 'echo "SKYPILOT_JOB_ID <{}>"'
32
37
 
33
38
 
34
39
  def _get_lock_path(job_id: int) -> str:
@@ -48,6 +53,7 @@ class JobInfoLoc(enum.IntEnum):
48
53
  START_AT = 6
49
54
  END_AT = 7
50
55
  RESOURCES = 8
56
+ PID = 9
51
57
 
52
58
 
53
59
  _DB_PATH = os.path.expanduser('~/.sky/jobs.db')
@@ -55,6 +61,31 @@ os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
55
61
 
56
62
 
57
63
  def create_table(cursor, conn):
64
+ # Enable WAL mode to avoid locking issues.
65
+ # See: issue #3863, #1441 and PR #1509
66
+ # https://github.com/microsoft/WSL/issues/2395
67
+ # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
68
+ # This may cause the database locked problem from WSL issue #1441.
69
+ if not common_utils.is_wsl():
70
+ try:
71
+ cursor.execute('PRAGMA journal_mode=WAL')
72
+ except sqlite3.OperationalError as e:
73
+ if 'database is locked' not in str(e):
74
+ raise
75
+ # If the database is locked, it is OK to continue, as the WAL mode
76
+ # is not critical and is likely to be enabled by other processes.
77
+
78
+ # Pid column is used for keeping track of the driver process of a job. It
79
+ # can be in three states:
80
+ # -1: The job was submitted with SkyPilot older than #4318, where we use
81
+ # ray job submit to submit the job, i.e. no pid is recorded. This is for
82
+ # backward compatibility and should be removed after 0.10.0.
83
+ # 0: The job driver process has never been started. When adding a job with
84
+ # INIT state, the pid will be set to 0 (the default -1 value is just for
85
+ # backward compatibility).
86
+ # >=0: The job has been started. The pid is the driver process's pid.
87
+ # The driver can be actually running or finished.
88
+ # TODO(SKY-1213): username is actually user hash, should rename.
58
89
  cursor.execute("""\
59
90
  CREATE TABLE IF NOT EXISTS jobs (
60
91
  job_id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -63,7 +94,10 @@ def create_table(cursor, conn):
63
94
  submitted_at FLOAT,
64
95
  status TEXT,
65
96
  run_timestamp TEXT CANDIDATE KEY,
66
- start_at FLOAT DEFAULT -1)""")
97
+ start_at FLOAT DEFAULT -1,
98
+ end_at FLOAT DEFAULT NULL,
99
+ resources TEXT DEFAULT NULL,
100
+ pid INTEGER DEFAULT -1)""")
67
101
 
68
102
  cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
69
103
  job_id INTEGER,
@@ -74,7 +108,8 @@ def create_table(cursor, conn):
74
108
 
75
109
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'end_at', 'FLOAT')
76
110
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
77
-
111
+ db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
112
+ 'INTEGER DEFAULT -1')
78
113
  conn.commit()
79
114
 
80
115
 
@@ -84,7 +119,7 @@ _CONN = _DB.conn
84
119
 
85
120
 
86
121
  class JobStatus(enum.Enum):
87
- """Job status"""
122
+ """Job status enum."""
88
123
 
89
124
  # 3 in-flux states: each can transition to any state below it.
90
125
  # The `job_id` has been generated, but the generated ray program has
@@ -93,54 +128,75 @@ class JobStatus(enum.Enum):
93
128
  # In the 'jobs' table, the `submitted_at` column will be set to the current
94
129
  # time, when the job is firstly created (in the INIT state).
95
130
  INIT = 'INIT'
131
+ """The job has been submitted, but not started yet."""
96
132
  # The job is waiting for the required resources. (`ray job status`
97
133
  # shows RUNNING as the generated ray program has started, but blocked
98
134
  # by the placement constraints.)
99
135
  PENDING = 'PENDING'
100
- # Running the user's setup script (only in effect if --detach-setup is
101
- # set). Our update_job_status() can temporarily (for a short period) set
136
+ """The job is waiting for required resources."""
137
+ # Running the user's setup script.
138
+ # Our update_job_status() can temporarily (for a short period) set
102
139
  # the status to SETTING_UP, if the generated ray program has not set
103
140
  # the status to PENDING or RUNNING yet.
104
141
  SETTING_UP = 'SETTING_UP'
142
+ """The job is running the user's setup script."""
105
143
  # The job is running.
106
144
  # In the 'jobs' table, the `start_at` column will be set to the current
107
145
  # time, when the job is firstly transitioned to RUNNING.
108
146
  RUNNING = 'RUNNING'
147
+ """The job is running."""
148
+ # The job driver process failed. This happens when the job driver process
149
+ # finishes when the status in job table is still not set to terminal state.
150
+ # We should keep this state before the SUCCEEDED, as our job status update
151
+ # relies on the order of the statuses to keep the latest status.
152
+ FAILED_DRIVER = 'FAILED_DRIVER'
153
+ """The job driver process failed."""
109
154
  # 3 terminal states below: once reached, they do not transition.
110
155
  # The job finished successfully.
111
156
  SUCCEEDED = 'SUCCEEDED'
157
+ """The job finished successfully."""
112
158
  # The job fails due to the user code or a system restart.
113
159
  FAILED = 'FAILED'
114
- # The job setup failed (only in effect if --detach-setup is set). It
115
- # needs to be placed after the `FAILED` state, so that the status
116
- # set by our generated ray program will not be overwritten by
117
- # ray's job status (FAILED).
118
- # This is for a better UX, so that the user can find out the reason
119
- # of the failure quickly.
160
+ """The job fails due to the user code."""
161
+ # The job setup failed. It needs to be placed after the `FAILED` state,
162
+ # so that the status set by our generated ray program will not be
163
+ # overwritten by ray's job status (FAILED). This is for a better UX, so
164
+ # that the user can find out the reason of the failure quickly.
120
165
  FAILED_SETUP = 'FAILED_SETUP'
166
+ """The job setup failed."""
121
167
  # The job is cancelled by the user.
122
168
  CANCELLED = 'CANCELLED'
169
+ """The job is cancelled by the user."""
123
170
 
124
171
  @classmethod
125
172
  def nonterminal_statuses(cls) -> List['JobStatus']:
126
173
  return [cls.INIT, cls.SETTING_UP, cls.PENDING, cls.RUNNING]
127
174
 
128
- def is_terminal(self):
175
+ def is_terminal(self) -> bool:
129
176
  return self not in self.nonterminal_statuses()
130
177
 
131
- def __lt__(self, other):
178
+ @classmethod
179
+ def user_code_failure_states(cls) -> Sequence['JobStatus']:
180
+ return (cls.FAILED, cls.FAILED_SETUP)
181
+
182
+ def __lt__(self, other: 'JobStatus') -> bool:
132
183
  return list(JobStatus).index(self) < list(JobStatus).index(other)
133
184
 
134
- def colored_str(self):
185
+ def colored_str(self) -> str:
135
186
  color = _JOB_STATUS_TO_COLOR[self]
136
187
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
137
188
 
138
189
 
139
- # Only update status of the jobs after this many seconds of job submission,
140
- # to avoid race condition with `ray job` to make sure it job has been
141
- # correctly updated.
190
+ # We have two steps for job submissions:
191
+ # 1. Client reserve a job id from the job table by adding a INIT state job.
192
+ # 2. Client updates the job status to PENDING by actually submitting the job's
193
+ # command to the scheduler.
194
+ # In normal cases, the two steps happens very close to each other through two
195
+ # consecutive SSH connections.
196
+ # We should update status for INIT job that has been staying in INIT state for
197
+ # a while (60 seconds), which likely fails to reach step 2.
142
198
  # TODO(zhwu): This number should be tuned based on heuristics.
143
- _PENDING_SUBMIT_GRACE_PERIOD = 60
199
+ _INIT_SUBMIT_GRACE_PERIOD = 60
144
200
 
145
201
  _PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
146
202
 
@@ -163,17 +219,38 @@ class JobScheduler:
163
219
  _CURSOR.execute((f'UPDATE pending_jobs SET submit={int(time.time())} '
164
220
  f'WHERE job_id={job_id!r}'))
165
221
  _CONN.commit()
166
- subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
222
+ pid = subprocess_utils.launch_new_process_tree(run_cmd)
223
+ # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
224
+ # This is for the case where the job is submitted with SkyPilot older
225
+ # than #4318, using ray job submit.
226
+ if 'job submit' in run_cmd:
227
+ pid = -1
228
+ _CURSOR.execute((f'UPDATE jobs SET pid={pid} '
229
+ f'WHERE job_id={job_id!r}'))
230
+ _CONN.commit()
167
231
 
168
232
  def schedule_step(self, force_update_jobs: bool = False) -> None:
169
- jobs = self._get_jobs()
170
- if len(jobs) > 0 or force_update_jobs:
233
+ if force_update_jobs:
171
234
  update_status()
235
+ pending_job_ids = self._get_pending_job_ids()
172
236
  # TODO(zhwu, mraheja): One optimization can be allowing more than one
173
237
  # job staying in the pending state after ray job submit, so that to be
174
238
  # faster to schedule a large amount of jobs.
175
- for job_id, run_cmd, submit, created_time in jobs:
239
+ for job_id in pending_job_ids:
176
240
  with filelock.FileLock(_get_lock_path(job_id)):
241
+ pending_job = _get_pending_job(job_id)
242
+ if pending_job is None:
243
+ # Pending job can be removed by another thread, due to the
244
+ # job being scheduled already.
245
+ continue
246
+ run_cmd = pending_job['run_cmd']
247
+ submit = pending_job['submit']
248
+ created_time = pending_job['created_time']
249
+ # We don't have to refresh the job status before checking, as
250
+ # the job status will only be stale in rare cases where ray job
251
+ # crashes; or the job stays in INIT state for a long time.
252
+ # In those cases, the periodic JobSchedulerEvent event will
253
+ # update the job status every 300 seconds.
177
254
  status = get_status_no_lock(job_id)
178
255
  if (status not in _PRE_RESOURCE_STATUSES or
179
256
  created_time < psutil.boot_time()):
@@ -187,8 +264,8 @@ class JobScheduler:
187
264
  self._run_job(job_id, run_cmd)
188
265
  return
189
266
 
190
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
191
- """Returns the metadata for jobs in the pending jobs table
267
+ def _get_pending_job_ids(self) -> List[int]:
268
+ """Returns the job ids in the pending jobs table
192
269
 
193
270
  The information contains job_id, run command, submit time,
194
271
  creation time.
@@ -199,9 +276,10 @@ class JobScheduler:
199
276
  class FIFOScheduler(JobScheduler):
200
277
  """First in first out job scheduler"""
201
278
 
202
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
203
- return list(
204
- _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
279
+ def _get_pending_job_ids(self) -> List[int]:
280
+ rows = _CURSOR.execute(
281
+ 'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
282
+ return [row[0] for row in rows]
205
283
 
206
284
 
207
285
  scheduler = FIFOScheduler()
@@ -211,59 +289,13 @@ _JOB_STATUS_TO_COLOR = {
211
289
  JobStatus.SETTING_UP: colorama.Fore.BLUE,
212
290
  JobStatus.PENDING: colorama.Fore.BLUE,
213
291
  JobStatus.RUNNING: colorama.Fore.GREEN,
292
+ JobStatus.FAILED_DRIVER: colorama.Fore.RED,
214
293
  JobStatus.SUCCEEDED: colorama.Fore.GREEN,
215
294
  JobStatus.FAILED: colorama.Fore.RED,
216
295
  JobStatus.FAILED_SETUP: colorama.Fore.RED,
217
296
  JobStatus.CANCELLED: colorama.Fore.YELLOW,
218
297
  }
219
298
 
220
- _RAY_TO_JOB_STATUS_MAP = {
221
- # These are intentionally set this way, because:
222
- # 1. when the ray status indicates the job is PENDING the generated
223
- # python program has been `ray job submit` from the job queue
224
- # and is now PENDING
225
- # 2. when the ray status indicates the job is RUNNING the job can be in
226
- # setup or resources may not be allocated yet, i.e. the job should be
227
- # PENDING.
228
- # For case 2, update_job_status() would compare this mapped PENDING to
229
- # the status in our jobs DB and take the max. This is because the job's
230
- # generated ray program is the only place that can determine a job has
231
- # reserved resources and actually started running: it will set the
232
- # status in the DB to SETTING_UP or RUNNING.
233
- # If there is no setup specified in the task, as soon as it is started
234
- # (ray's status becomes RUNNING), i.e. it will be very rare that the job
235
- # will be set to SETTING_UP by the update_job_status, as our generated
236
- # ray program will set the status to PENDING immediately.
237
- 'PENDING': JobStatus.PENDING,
238
- 'RUNNING': JobStatus.PENDING,
239
- 'SUCCEEDED': JobStatus.SUCCEEDED,
240
- 'FAILED': JobStatus.FAILED,
241
- 'STOPPED': JobStatus.CANCELLED,
242
- }
243
-
244
-
245
- def _create_ray_job_submission_client():
246
- """Import the ray job submission client."""
247
- try:
248
- import ray # pylint: disable=import-outside-toplevel
249
- except ImportError:
250
- logger.error('Failed to import ray')
251
- raise
252
- try:
253
- # pylint: disable=import-outside-toplevel
254
- from ray import job_submission
255
- except ImportError:
256
- logger.error(
257
- f'Failed to import job_submission with ray=={ray.__version__}')
258
- raise
259
- port = get_job_submission_port()
260
- return job_submission.JobSubmissionClient(
261
- address=f'http://127.0.0.1:{port}')
262
-
263
-
264
- def make_ray_job_id(sky_job_id: int) -> str:
265
- return f'{sky_job_id}-{getpass.getuser()}'
266
-
267
299
 
268
300
  def make_job_command_with_user_switching(username: str,
269
301
  command: str) -> List[str]:
@@ -275,9 +307,10 @@ def add_job(job_name: str, username: str, run_timestamp: str,
275
307
  """Atomically reserve the next available job id for the user."""
276
308
  job_submitted_at = time.time()
277
309
  # job_id will autoincrement with the null value
278
- _CURSOR.execute('INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?)',
279
- (job_name, username, job_submitted_at, JobStatus.INIT.value,
280
- run_timestamp, None, resources_str))
310
+ _CURSOR.execute(
311
+ 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
312
+ (job_name, username, job_submitted_at, JobStatus.INIT.value,
313
+ run_timestamp, None, resources_str))
281
314
  _CONN.commit()
282
315
  rows = _CURSOR.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
283
316
  (run_timestamp,))
@@ -362,12 +395,12 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
362
395
  statuses = {job_id: None for job_id in job_ids}
363
396
  for (job_id, status) in rows:
364
397
  statuses[job_id] = status
365
- return common_utils.encode_payload(statuses)
398
+ return message_utils.encode_payload(statuses)
366
399
 
367
400
 
368
401
  def load_statuses_payload(
369
402
  statuses_payload: str) -> Dict[Optional[int], Optional[JobStatus]]:
370
- original_statuses = common_utils.decode_payload(statuses_payload)
403
+ original_statuses = message_utils.decode_payload(statuses_payload)
371
404
  statuses = dict()
372
405
  for job_id, status in original_statuses.items():
373
406
  # json.dumps will convert all keys to strings. Integers will
@@ -405,8 +438,8 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
405
438
  rows = _CURSOR.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
406
439
  (job_id,))
407
440
  for (timestamp,) in rows:
408
- return common_utils.encode_payload(timestamp)
409
- return common_utils.encode_payload(None)
441
+ return message_utils.encode_payload(timestamp)
442
+ return message_utils.encode_payload(None)
410
443
 
411
444
 
412
445
  def get_ray_port():
@@ -452,35 +485,26 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
452
485
  'start_at': row[JobInfoLoc.START_AT.value],
453
486
  'end_at': row[JobInfoLoc.END_AT.value],
454
487
  'resources': row[JobInfoLoc.RESOURCES.value],
488
+ 'pid': row[JobInfoLoc.PID.value],
455
489
  })
456
490
  return records
457
491
 
458
492
 
459
493
  def _get_jobs(
460
- username: Optional[str],
494
+ user_hash: Optional[str],
461
495
  status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
462
496
  """Returns jobs with the given fields, sorted by job_id, descending."""
463
497
  if status_list is None:
464
498
  status_list = list(JobStatus)
465
- status_str_list = [status.value for status in status_list]
466
- if username is None:
467
- rows = _CURSOR.execute(
468
- f"""\
469
- SELECT * FROM jobs
470
- WHERE status IN ({','.join(['?'] * len(status_list))})
471
- ORDER BY job_id DESC""",
472
- (*status_str_list,),
473
- )
474
- else:
475
- rows = _CURSOR.execute(
476
- f"""\
477
- SELECT * FROM jobs
478
- WHERE status IN ({','.join(['?'] * len(status_list))})
479
- AND username=(?)
480
- ORDER BY job_id DESC""",
481
- (*status_str_list, username),
482
- )
483
-
499
+ status_str_list = [repr(status.value) for status in status_list]
500
+ filter_str = f'WHERE status IN ({",".join(status_str_list)})'
501
+ params = []
502
+ if user_hash is not None:
503
+ # We use the old username field for compatibility.
504
+ filter_str += ' AND username=(?)'
505
+ params.append(user_hash)
506
+ rows = _CURSOR.execute(
507
+ f'SELECT * FROM jobs {filter_str} ORDER BY job_id DESC', params)
484
508
  records = _get_records_from_rows(rows)
485
509
  return records
486
510
 
@@ -497,16 +521,35 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
497
521
  return records
498
522
 
499
523
 
500
- def _get_pending_jobs():
524
+ def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
501
525
  rows = _CURSOR.execute(
502
- 'SELECT job_id, created_time, submit FROM pending_jobs')
503
- rows = list(rows)
504
- return {
505
- job_id: {
526
+ 'SELECT created_time, submit, run_cmd FROM pending_jobs '
527
+ f'WHERE job_id={job_id!r}')
528
+ for row in rows:
529
+ created_time, submit, run_cmd = row
530
+ return {
506
531
  'created_time': created_time,
507
- 'submit': submit
508
- } for job_id, created_time, submit in rows
509
- }
532
+ 'submit': submit,
533
+ 'run_cmd': run_cmd
534
+ }
535
+ return None
536
+
537
+
538
+ def _is_job_driver_process_running(job_pid: int, job_id: int) -> bool:
539
+ """Check if the job driver process is running.
540
+
541
+ We check the cmdline to avoid the case where the same pid is reused by a
542
+ different process.
543
+ """
544
+ if job_pid <= 0:
545
+ return False
546
+ try:
547
+ job_process = psutil.Process(job_pid)
548
+ return job_process.is_running() and any(
549
+ JOB_CMD_IDENTIFIER.format(job_id) in line
550
+ for line in job_process.cmdline())
551
+ except psutil.NoSuchProcess:
552
+ return False
510
553
 
511
554
 
512
555
  def update_job_status(job_ids: List[int],
@@ -520,94 +563,131 @@ def update_job_status(job_ids: List[int],
520
563
  during job cancelling, we still need this to handle the staleness problem,
521
564
  caused by instance restarting and other corner cases (if any).
522
565
 
523
- This function should only be run on the remote instance with ray==2.4.0.
566
+ This function should only be run on the remote instance with ray>=2.4.0.
524
567
  """
525
- if len(job_ids) == 0:
568
+ echo = logger.info if not silent else logger.debug
569
+ if not job_ids:
526
570
  return []
527
571
 
528
- # TODO: if too slow, directly query against redis.
529
- ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
530
-
531
- job_client = _create_ray_job_submission_client()
532
-
533
- # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
534
- # which contains the job status (str) and submission_id (str).
535
- job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
536
-
537
- pending_jobs = _get_pending_jobs()
538
- job_details = {}
539
- ray_job_ids_set = set(ray_job_ids)
540
- for job_detail in job_detail_lists:
541
- if job_detail.submission_id in ray_job_ids_set:
542
- job_details[job_detail.submission_id] = job_detail
543
- job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
544
- for i, ray_job_id in enumerate(ray_job_ids):
545
- job_id = job_ids[i]
546
- if ray_job_id in job_details:
547
- ray_status = job_details[ray_job_id].status
548
- job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
549
- if job_id in pending_jobs:
550
- if pending_jobs[job_id]['created_time'] < psutil.boot_time():
551
- logger.info(
552
- f'Job {job_id} is stale, setting to FAILED: '
553
- f'created_time={pending_jobs[job_id]["created_time"]}, '
554
- f'boot_time={psutil.boot_time()}')
555
- # The job is stale as it is created before the instance
556
- # is booted, e.g. the instance is rebooted.
557
- job_statuses[i] = JobStatus.FAILED
558
- # Gives a 60 second grace period between job being submit from
559
- # the pending table until appearing in ray jobs.
560
- if (pending_jobs[job_id]['submit'] > 0 and
561
- pending_jobs[job_id]['submit'] <
562
- time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
563
- # For jobs submitted outside of the grace period, we will
564
- # consider the ray job status.
565
- continue
566
- else:
567
- # Reset the job status to PENDING even though it may not appear
568
- # in the ray jobs, so that it will not be considered as stale.
569
- job_statuses[i] = JobStatus.PENDING
570
-
571
- assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
572
-
573
572
  statuses = []
574
- for job_id, status in zip(job_ids, job_statuses):
573
+ for job_id in job_ids:
575
574
  # Per-job status lock is required because between the job status
576
575
  # query and the job status update, the job status in the databse
577
576
  # can be modified by the generated ray program.
578
577
  with filelock.FileLock(_get_lock_path(job_id)):
579
- original_status = get_status_no_lock(job_id)
578
+ status = None
579
+ job_record = _get_jobs_by_ids([job_id])[0]
580
+ original_status = job_record['status']
581
+ job_submitted_at = job_record['submitted_at']
582
+ job_pid = job_record['pid']
583
+
584
+ pid_query_time = time.time()
585
+ failed_driver_transition_message = None
586
+ if original_status == JobStatus.INIT:
587
+ if (job_submitted_at >= psutil.boot_time() and job_submitted_at
588
+ >= pid_query_time - _INIT_SUBMIT_GRACE_PERIOD):
589
+ # The job id is reserved, but the job is not submitted yet.
590
+ # We should keep it in INIT.
591
+ status = JobStatus.INIT
592
+ else:
593
+ # We always immediately submit job after the job id is
594
+ # allocated, i.e. INIT -> PENDING, if a job stays in INIT
595
+ # for too long, it is likely the job submission process
596
+ # was killed before the job is submitted. We should set it
597
+ # to FAILED then. Note, if ray job indicates the job is
598
+ # running, we will change status to PENDING below.
599
+ failed_driver_transition_message = (
600
+ f'INIT job {job_id} is stale, setting to FAILED_DRIVER')
601
+ status = JobStatus.FAILED_DRIVER
602
+
603
+ # job_pid is 0 if the job is not submitted yet.
604
+ # job_pid is -1 if the job is submitted with SkyPilot older than
605
+ # #4318, using ray job submit. We skip the checking for those
606
+ # jobs.
607
+ if job_pid > 0:
608
+ if _is_job_driver_process_running(job_pid, job_id):
609
+ status = JobStatus.PENDING
610
+ else:
611
+ # By default, if the job driver process does not exist,
612
+ # the actual SkyPilot job is one of the following:
613
+ # 1. Still pending to be submitted.
614
+ # 2. Submitted and finished.
615
+ # 3. Driver failed without correctly setting the job
616
+ # status in the job table.
617
+ # Although we set the status to FAILED_DRIVER, it can be
618
+ # overridden to PENDING if the job is not submitted, or
619
+ # any other terminal status if the job driver process
620
+ # finished correctly.
621
+ failed_driver_transition_message = (
622
+ f'Job {job_id} driver process is not running, but '
623
+ 'the job state is not in terminal states, setting '
624
+ 'it to FAILED_DRIVER')
625
+ status = JobStatus.FAILED_DRIVER
626
+ elif job_pid < 0:
627
+ # TODO(zhwu): Backward compatibility, remove after 0.9.0.
628
+ # We set the job status to PENDING instead of actually
629
+ # checking ray job status and let the status in job table
630
+ # take effect in the later max.
631
+ status = JobStatus.PENDING
632
+
633
+ pending_job = _get_pending_job(job_id)
634
+ if pending_job is not None:
635
+ if pending_job['created_time'] < psutil.boot_time():
636
+ failed_driver_transition_message = (
637
+ f'Job {job_id} is stale, setting to FAILED_DRIVER: '
638
+ f'created_time={pending_job["created_time"]}, '
639
+ f'boot_time={psutil.boot_time()}')
640
+ # The job is stale as it is created before the instance
641
+ # is booted, e.g. the instance is rebooted.
642
+ status = JobStatus.FAILED_DRIVER
643
+ elif pending_job['submit'] <= 0:
644
+ # The job is not submitted (submit <= 0), we set it to
645
+ # PENDING.
646
+ # For submitted jobs, the driver should have been started,
647
+ # because the job_lib.JobScheduler.schedule_step() have
648
+ # the submit field and driver process pid set in the same
649
+ # job lock.
650
+ # The job process check in the above section should
651
+ # correctly figured out the status and we don't overwrite
652
+ # it here. (Note: the FAILED_DRIVER status will be
653
+ # overridden by the actual job terminal status in the table
654
+ # if the job driver process finished correctly.)
655
+ status = JobStatus.PENDING
656
+
580
657
  assert original_status is not None, (job_id, status)
581
658
  if status is None:
659
+ # The job is submitted but the job driver process pid is not
660
+ # set in the database. This is guarding against the case where
661
+ # the schedule_step() function is interrupted (e.g., VM stop)
662
+ # at the middle of starting a new process and setting the pid.
582
663
  status = original_status
583
664
  if (original_status is not None and
584
665
  not original_status.is_terminal()):
585
- logger.info(f'Ray job status for job {job_id} is None, '
586
- 'setting it to FAILED.')
587
- # The job may be stale, when the instance is restarted
588
- # (the ray redis is volatile). We need to reset the
589
- # status of the task to FAILED if its original status
590
- # is RUNNING or PENDING.
591
- status = JobStatus.FAILED
666
+ echo(f'Job {job_id} status is None, setting it to '
667
+ 'FAILED_DRIVER.')
668
+ # The job may be stale, when the instance is restarted. We
669
+ # need to reset the job status to FAILED_DRIVER if its
670
+ # original status is in nonterminal_statuses.
671
+ echo(f'Job {job_id} is in a unknown state, setting it to '
672
+ 'FAILED_DRIVER')
673
+ status = JobStatus.FAILED_DRIVER
592
674
  _set_status_no_lock(job_id, status)
593
- if not silent:
594
- logger.info(f'Updated job {job_id} status to {status}')
595
675
  else:
596
676
  # Taking max of the status is necessary because:
597
- # 1. It avoids race condition, where the original status has
598
- # already been set to later state by the job. We skip the
599
- # update.
600
- # 2. _RAY_TO_JOB_STATUS_MAP would map `ray job status`'s
601
- # `RUNNING` to our JobStatus.SETTING_UP; if a job has already
602
- # been set to JobStatus.PENDING or JobStatus.RUNNING by the
603
- # generated ray program, `original_status` (job status from our
604
- # DB) would already have that value. So we take the max here to
605
- # keep it at later status.
677
+ # 1. The original status has already been set to later
678
+ # terminal state by a finished job driver.
679
+ # 2. Job driver process check would map any running job process
680
+ # to `PENDING`, so we need to take the max to keep it at
681
+ # later status for jobs actually started in SETTING_UP or
682
+ # RUNNING.
606
683
  status = max(status, original_status)
684
+ assert status is not None, (job_id, status, original_status)
607
685
  if status != original_status: # Prevents redundant update.
608
686
  _set_status_no_lock(job_id, status)
609
- if not silent:
610
- logger.info(f'Updated job {job_id} status to {status}')
687
+ echo(f'Updated job {job_id} status to {status}')
688
+ if (status == JobStatus.FAILED_DRIVER and
689
+ failed_driver_transition_message is not None):
690
+ echo(failed_driver_transition_message)
611
691
  statuses.append(status)
612
692
  return statuses
613
693
 
@@ -620,18 +700,14 @@ def fail_all_jobs_in_progress() -> None:
620
700
  f"""\
621
701
  UPDATE jobs SET status=(?)
622
702
  WHERE status IN ({','.join(['?'] * len(in_progress_status))})
623
- """, (JobStatus.FAILED.value, *in_progress_status))
703
+ """, (JobStatus.FAILED_DRIVER.value, *in_progress_status))
624
704
  _CONN.commit()
625
705
 
626
706
 
627
707
  def update_status() -> None:
628
708
  # This will be called periodically by the skylet to update the status
629
709
  # of the jobs in the database, to avoid stale job status.
630
- # NOTE: there might be a INIT job in the database set to FAILED by this
631
- # function, as the ray job status does not exist due to the app
632
- # not submitted yet. It will be then reset to PENDING / RUNNING when the
633
- # app starts.
634
- nonterminal_jobs = _get_jobs(username=None,
710
+ nonterminal_jobs = _get_jobs(user_hash=None,
635
711
  status_list=JobStatus.nonterminal_statuses())
636
712
  nonterminal_job_ids = [job['job_id'] for job in nonterminal_jobs]
637
713
 
@@ -661,13 +737,14 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
661
737
  print(format_job_queue(jobs))
662
738
  """
663
739
  job_table = log_utils.create_table([
664
- 'ID', 'NAME', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES', 'STATUS',
665
- 'LOG'
740
+ 'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
741
+ 'STATUS', 'LOG'
666
742
  ])
667
743
  for job in jobs:
668
744
  job_table.add_row([
669
745
  job['job_id'],
670
746
  job['job_name'],
747
+ job['username'],
671
748
  log_utils.readable_time_duration(job['submitted_at']),
672
749
  log_utils.readable_time_duration(job['start_at']),
673
750
  log_utils.readable_time_duration(job['start_at'],
@@ -680,11 +757,11 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
680
757
  return job_table
681
758
 
682
759
 
683
- def dump_job_queue(username: Optional[str], all_jobs: bool) -> str:
760
+ def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
684
761
  """Get the job queue in encoded json format.
685
762
 
686
763
  Args:
687
- username: The username to show jobs for. Show all the users if None.
764
+ user_hash: The user hash to show jobs for. Show all the users if None.
688
765
  all_jobs: Whether to show all jobs, not just the pending/running ones.
689
766
  """
690
767
  status_list: Optional[List[JobStatus]] = [
@@ -693,12 +770,12 @@ def dump_job_queue(username: Optional[str], all_jobs: bool) -> str:
693
770
  if all_jobs:
694
771
  status_list = None
695
772
 
696
- jobs = _get_jobs(username, status_list=status_list)
773
+ jobs = _get_jobs(user_hash, status_list=status_list)
697
774
  for job in jobs:
698
775
  job['status'] = job['status'].value
699
776
  job['log_path'] = os.path.join(constants.SKY_LOGS_DIRECTORY,
700
777
  job.pop('run_timestamp'))
701
- return common_utils.encode_payload(jobs)
778
+ return message_utils.encode_payload(jobs)
702
779
 
703
780
 
704
781
  def load_job_queue(payload: str) -> List[Dict[str, Any]]:
@@ -707,68 +784,118 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
707
784
  Args:
708
785
  payload: The encoded payload string to load.
709
786
  """
710
- jobs = common_utils.decode_payload(payload)
787
+ jobs = message_utils.decode_payload(payload)
711
788
  for job in jobs:
712
789
  job['status'] = JobStatus(job['status'])
790
+ job['user_hash'] = job['username']
791
+ job['username'] = global_user_state.get_user(job['user_hash']).name
713
792
  return jobs
714
793
 
715
794
 
795
+ # TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
796
+ # after 0.10.0.
797
+ def _create_ray_job_submission_client():
798
+ """Import the ray job submission client."""
799
+ try:
800
+ import ray # pylint: disable=import-outside-toplevel
801
+ except ImportError:
802
+ logger.error('Failed to import ray')
803
+ raise
804
+ try:
805
+ # pylint: disable=import-outside-toplevel
806
+ from ray import job_submission
807
+ except ImportError:
808
+ logger.error(
809
+ f'Failed to import job_submission with ray=={ray.__version__}')
810
+ raise
811
+ port = get_job_submission_port()
812
+ return job_submission.JobSubmissionClient(
813
+ address=f'http://127.0.0.1:{port}')
814
+
815
+
816
+ def _make_ray_job_id(sky_job_id: int) -> str:
817
+ return f'{sky_job_id}-{getpass.getuser()}'
818
+
819
+
716
820
  def cancel_jobs_encoded_results(jobs: Optional[List[int]],
717
- cancel_all: bool = False) -> str:
821
+ cancel_all: bool = False,
822
+ user_hash: Optional[str] = None) -> str:
718
823
  """Cancel jobs.
719
824
 
720
825
  Args:
721
- jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
722
- cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
723
- set to None. If False and `jobs` is None, cancel the latest
724
- running job.
826
+ jobs: Job IDs to cancel.
827
+ cancel_all: Whether to cancel all jobs.
828
+ user_hash: If specified, cancels the jobs for the specified user only.
829
+ Otherwise, applies to all users.
725
830
 
726
831
  Returns:
727
832
  Encoded job IDs that are actually cancelled. Caller should use
728
- common_utils.decode_payload() to parse.
833
+ message_utils.decode_payload() to parse.
729
834
  """
730
- if cancel_all:
731
- # Cancel all in-progress jobs.
732
- assert jobs is None, ('If cancel_all=True, usage is to set jobs=None')
733
- job_records = _get_jobs(
734
- None, [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING])
735
- else:
736
- if jobs is None:
737
- # Cancel the latest (largest job ID) running job.
738
- job_records = _get_jobs(None, [JobStatus.RUNNING])[:1]
739
- else:
740
- # Cancel jobs with specified IDs.
741
- job_records = _get_jobs_by_ids(jobs)
742
-
743
- # TODO(zhwu): `job_client.stop_job` will wait for the jobs to be killed, but
744
- # when the memory is not enough, this will keep waiting.
745
- job_client = _create_ray_job_submission_client()
746
- cancelled_ids = []
835
+ job_records = []
836
+ all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
837
+ if jobs is None and not cancel_all:
838
+ # Cancel the latest (largest job ID) running job from current user.
839
+ job_records = _get_jobs(user_hash, [JobStatus.RUNNING])[:1]
840
+ elif cancel_all:
841
+ job_records = _get_jobs(user_hash, all_status)
842
+ if jobs is not None:
843
+ job_records.extend(_get_jobs_by_ids(jobs))
747
844
 
845
+ cancelled_ids = []
748
846
  # Sequentially cancel the jobs to avoid the resource number bug caused by
749
847
  # ray cluster (tracked in #1262).
750
- for job in job_records:
751
- job_id = make_ray_job_id(job['job_id'])
848
+ for job_record in job_records:
849
+ job_id = job_record['job_id']
752
850
  # Job is locked to ensure that pending queue does not start it while
753
851
  # it is being cancelled
754
- with filelock.FileLock(_get_lock_path(job['job_id'])):
755
- try:
756
- job_client.stop_job(job_id)
757
- except RuntimeError as e:
758
- # If the request to the job server fails, we should not
759
- # set the job to CANCELLED.
760
- if 'does not exist' not in str(e):
761
- logger.warning(str(e))
762
- continue
763
-
764
- if job['status'] in [
852
+ with filelock.FileLock(_get_lock_path(job_id)):
853
+ job = _get_jobs_by_ids([job_id])[0]
854
+ if _is_job_driver_process_running(job['pid'], job_id):
855
+ # Not use process.terminate() as that will only terminate the
856
+ # process shell process, not the ray driver process
857
+ # under the shell.
858
+ #
859
+ # We don't kill all the children of the process, like
860
+ # subprocess_utils.kill_process_daemon() does, but just the
861
+ # process group here, because the underlying job driver can
862
+ # start other jobs with `schedule_step`, causing the other job
863
+ # driver processes to be children of the current job driver
864
+ # process.
865
+ #
866
+ # Killing the process group is enough as the underlying job
867
+ # should be able to clean itself up correctly by ray driver.
868
+ #
869
+ # The process group pid should be the same as the job pid as we
870
+ # use start_new_session=True, but we use os.getpgid() to be
871
+ # extra cautious.
872
+ job_pgid = os.getpgid(job['pid'])
873
+ os.killpg(job_pgid, signal.SIGTERM)
874
+ # We don't have to start a daemon to forcefully kill the process
875
+ # as our job driver process will clean up the underlying
876
+ # child processes.
877
+ elif job['pid'] < 0:
878
+ try:
879
+ # TODO(zhwu): Backward compatibility, remove after 0.9.0.
880
+ # The job was submitted with ray job submit before #4318.
881
+ job_client = _create_ray_job_submission_client()
882
+ job_client.stop_job(_make_ray_job_id(job['job_id']))
883
+ except RuntimeError as e:
884
+ # If the request to the job server fails, we should not
885
+ # set the job to CANCELLED.
886
+ if 'does not exist' not in str(e):
887
+ logger.warning(str(e))
888
+ continue
889
+ # Get the job status again to avoid race condition.
890
+ job_status = get_status_no_lock(job['job_id'])
891
+ if job_status in [
765
892
  JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING
766
893
  ]:
767
894
  _set_status_no_lock(job['job_id'], JobStatus.CANCELLED)
768
895
  cancelled_ids.append(job['job_id'])
769
896
 
770
897
  scheduler.schedule_step()
771
- return common_utils.encode_payload(cancelled_ids)
898
+ return message_utils.encode_payload(cancelled_ids)
772
899
 
773
900
 
774
901
  def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
@@ -797,7 +924,7 @@ def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
797
924
  job_id = row[JobInfoLoc.JOB_ID.value]
798
925
  run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
799
926
  run_timestamps[str(job_id)] = run_timestamp
800
- return common_utils.encode_payload(run_timestamps)
927
+ return message_utils.encode_payload(run_timestamps)
801
928
 
802
929
 
803
930
  class JobLibCodeGen:
@@ -812,14 +939,6 @@ class JobLibCodeGen:
812
939
  'import os',
813
940
  'import getpass',
814
941
  'from sky.skylet import job_lib, log_lib, constants',
815
- # Backward compatibility for old skylet lib version on the remote
816
- # machine. The `job_owner` argument was removed in #3037, and if the
817
- # remote machine has an old SkyPilot version before that, we need to
818
- # pass the `job_owner` argument to the job_lib functions.
819
- # TODO(zhwu): Remove this in 0.7.0 release.
820
- 'job_owner_kwargs = {} '
821
- 'if getattr(constants, "SKYLET_LIB_VERSION", 0) >= 1 '
822
- 'else {"job_owner": getpass.getuser()}',
823
942
  ]
824
943
 
825
944
  @classmethod
@@ -828,10 +947,17 @@ class JobLibCodeGen:
828
947
  if job_name is None:
829
948
  job_name = '-'
830
949
  code = [
831
- 'job_id = job_lib.add_job('
832
- f'{job_name!r}, '
833
- f'{username!r}, '
834
- f'{run_timestamp!r}, '
950
+ # We disallow job submission when SKYLET_VERSION is older than 9, as
951
+ # it was using ray job submit before #4318, and switched to raw
952
+ # process. Using the old skylet version will cause the job status
953
+ # to be stuck in PENDING state or transition to FAILED_DRIVER state.
954
+ '\nif int(constants.SKYLET_VERSION) < 9: '
955
+ 'raise RuntimeError("SkyPilot runtime is too old, which does not '
956
+ 'support submitting jobs.")',
957
+ '\njob_id = job_lib.add_job('
958
+ f'{job_name!r},'
959
+ f'{username!r},'
960
+ f'{run_timestamp!r},'
835
961
  f'{resources_str!r})',
836
962
  'print("Job ID: " + str(job_id), flush=True)',
837
963
  ]
@@ -839,35 +965,49 @@ class JobLibCodeGen:
839
965
 
840
966
  @classmethod
841
967
  def queue_job(cls, job_id: int, cmd: str) -> str:
842
- code = ['job_lib.scheduler.queue('
843
- f'{job_id!r},'
844
- f'{cmd!r})']
968
+ code = [
969
+ 'job_lib.scheduler.queue('
970
+ f'{job_id!r},'
971
+ f'{cmd!r})',
972
+ ]
845
973
  return cls._build(code)
846
974
 
847
975
  @classmethod
848
976
  def update_status(cls) -> str:
849
- code = ['job_lib.update_status(**job_owner_kwargs)']
977
+ code = ['job_lib.update_status()']
850
978
  return cls._build(code)
851
979
 
852
980
  @classmethod
853
- def get_job_queue(cls, username: Optional[str], all_jobs: bool) -> str:
981
+ def get_job_queue(cls, user_hash: Optional[str], all_jobs: bool) -> str:
982
+ # TODO(SKY-1214): combine get_job_queue with get_job_statuses.
854
983
  code = [
855
984
  'job_queue = job_lib.dump_job_queue('
856
- f'{username!r}, {all_jobs})', 'print(job_queue, flush=True)'
985
+ f'{user_hash!r}, {all_jobs})',
986
+ 'print(job_queue, flush=True)',
857
987
  ]
858
988
  return cls._build(code)
859
989
 
860
990
  @classmethod
861
991
  def cancel_jobs(cls,
862
992
  job_ids: Optional[List[int]],
863
- cancel_all: bool = False) -> str:
993
+ cancel_all: bool = False,
994
+ user_hash: Optional[str] = None) -> str:
864
995
  """See job_lib.cancel_jobs()."""
865
996
  code = [
866
997
  (f'cancelled = job_lib.cancel_jobs_encoded_results('
867
- f' {job_ids!r}, {cancel_all}, **job_owner_kwargs)'),
998
+ f'jobs={job_ids!r}, cancel_all={cancel_all}, '
999
+ f'user_hash={user_hash!r})'),
868
1000
  # Print cancelled IDs. Caller should parse by decoding.
869
1001
  'print(cancelled, flush=True)',
870
1002
  ]
1003
+ # TODO(zhwu): Backward compatibility, remove after 0.9.0.
1004
+ if user_hash is None:
1005
+ code = [
1006
+ (f'cancelled = job_lib.cancel_jobs_encoded_results('
1007
+ f' {job_ids!r}, {cancel_all})'),
1008
+ # Print cancelled IDs. Caller should parse by decoding.
1009
+ 'print(cancelled, flush=True)',
1010
+ ]
871
1011
  return cls._build(code)
872
1012
 
873
1013
  @classmethod
@@ -880,14 +1020,19 @@ class JobLibCodeGen:
880
1020
  def tail_logs(cls,
881
1021
  job_id: Optional[int],
882
1022
  managed_job_id: Optional[int],
883
- follow: bool = True) -> str:
1023
+ follow: bool = True,
1024
+ tail: int = 0) -> str:
884
1025
  # pylint: disable=line-too-long
1026
+
885
1027
  code = [
1028
+ # We use != instead of is not because 1 is not None will print a warning:
1029
+ # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
886
1030
  f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
887
1031
  'run_timestamp = job_lib.get_run_timestamp(job_id)',
888
1032
  f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
889
- f'log_lib.tail_logs(job_id=job_id, log_dir=log_dir, '
890
- f'managed_job_id={managed_job_id!r}, follow={follow}, **job_owner_kwargs)',
1033
+ f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
1034
+ f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
1035
+ f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
891
1036
  ]
892
1037
  return cls._build(code)
893
1038