skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -16,15 +16,20 @@ import typing
16
16
  from typing import Any, Dict, List, Optional, Set, Tuple
17
17
  import uuid
18
18
 
19
- from sky import clouds
20
- from sky import status_lib
19
+ from sky import models
20
+ from sky import sky_logging
21
21
  from sky.utils import common_utils
22
22
  from sky.utils import db_utils
23
+ from sky.utils import registry
24
+ from sky.utils import status_lib
23
25
 
24
26
  if typing.TYPE_CHECKING:
25
27
  from sky import backends
28
+ from sky import clouds
26
29
  from sky.data import Storage
27
30
 
31
+ logger = sky_logging.init_logger(__name__)
32
+
28
33
  _ENABLED_CLOUDS_KEY = 'enabled_clouds'
29
34
 
30
35
  _DB_PATH = os.path.expanduser('~/.sky/state.db')
@@ -55,12 +60,15 @@ def create_table(cursor, conn):
55
60
  last_use TEXT,
56
61
  status TEXT,
57
62
  autostop INTEGER DEFAULT -1,
58
- metadata TEXT DEFAULT "{}",
63
+ metadata TEXT DEFAULT '{}',
59
64
  to_down INTEGER DEFAULT 0,
60
65
  owner TEXT DEFAULT null,
61
66
  cluster_hash TEXT DEFAULT null,
62
67
  storage_mounts_metadata BLOB DEFAULT null,
63
- cluster_ever_up INTEGER DEFAULT 0)""")
68
+ cluster_ever_up INTEGER DEFAULT 0,
69
+ status_updated_at INTEGER DEFAULT null,
70
+ config_hash TEXT DEFAULT null,
71
+ user_hash TEXT DEFAULT null)""")
64
72
 
65
73
  # Table for Cluster History
66
74
  # usage_intervals: List[Tuple[int, int]]
@@ -83,7 +91,8 @@ def create_table(cursor, conn):
83
91
  num_nodes int,
84
92
  requested_resources BLOB,
85
93
  launched_resources BLOB,
86
- usage_intervals BLOB)""")
94
+ usage_intervals BLOB,
95
+ user_hash TEXT)""")
87
96
  # Table for configs (e.g. enabled clouds)
88
97
  cursor.execute("""\
89
98
  CREATE TABLE IF NOT EXISTS config (
@@ -96,6 +105,11 @@ def create_table(cursor, conn):
96
105
  handle BLOB,
97
106
  last_use TEXT,
98
107
  status TEXT)""")
108
+ # Table for User
109
+ cursor.execute("""\
110
+ CREATE TABLE IF NOT EXISTS users (
111
+ id TEXT PRIMARY KEY,
112
+ name TEXT)""")
99
113
  # For backward compatibility.
100
114
  # TODO(zhwu): Remove this function after all users have migrated to
101
115
  # the latest version of SkyPilot.
@@ -104,11 +118,12 @@ def create_table(cursor, conn):
104
118
  'INTEGER DEFAULT -1')
105
119
 
106
120
  db_utils.add_column_to_table(cursor, conn, 'clusters', 'metadata',
107
- 'TEXT DEFAULT "{}"')
121
+ 'TEXT DEFAULT \'{}\'')
108
122
 
109
123
  db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
110
124
  'INTEGER DEFAULT 0')
111
125
 
126
+ # The cloud identity that created the cluster.
112
127
  db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')
113
128
 
114
129
  db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
@@ -130,17 +145,52 @@ def create_table(cursor, conn):
130
145
  # clusters were never really UP, setting it to 1 means they won't be
131
146
  # auto-deleted during any failover.
132
147
  value_to_replace_existing_entries=1)
148
+ db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
149
+ 'INTEGER DEFAULT null')
150
+ db_utils.add_column_to_table(
151
+ cursor,
152
+ conn,
153
+ 'clusters',
154
+ 'user_hash',
155
+ 'TEXT DEFAULT null',
156
+ value_to_replace_existing_entries=common_utils.get_user_hash())
157
+ db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
158
+ 'TEXT DEFAULT null')
159
+
160
+ db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
161
+ 'TEXT DEFAULT null')
162
+
163
+ db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
164
+ 'TEXT DEFAULT null')
133
165
  conn.commit()
134
166
 
135
167
 
136
168
  _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
137
169
 
138
170
 
171
+ def add_or_update_user(user: models.User):
172
+ """Store the mapping from user hash to user name for display purposes."""
173
+ if user.name is None:
174
+ return
175
+ _DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
176
+ (user.id, user.name))
177
+ _DB.conn.commit()
178
+
179
+
180
+ def get_user(user_id: str) -> models.User:
181
+ row = _DB.cursor.execute('SELECT id, name FROM users WHERE id=?',
182
+ (user_id,)).fetchone()
183
+ if row is None:
184
+ return models.User(id=user_id)
185
+ return models.User(id=row[0], name=row[1])
186
+
187
+
139
188
  def add_or_update_cluster(cluster_name: str,
140
189
  cluster_handle: 'backends.ResourceHandle',
141
190
  requested_resources: Optional[Set[Any]],
142
191
  ready: bool,
143
- is_launch: bool = True):
192
+ is_launch: bool = True,
193
+ config_hash: Optional[str] = None):
144
194
  """Adds or updates cluster_name -> cluster_handle mapping.
145
195
 
146
196
  Args:
@@ -155,10 +205,11 @@ def add_or_update_cluster(cluster_name: str,
155
205
  # FIXME: launched_at will be changed when `sky launch -c` is called.
156
206
  handle = pickle.dumps(cluster_handle)
157
207
  cluster_launched_at = int(time.time()) if is_launch else None
158
- last_use = common_utils.get_pretty_entry_point() if is_launch else None
208
+ last_use = common_utils.get_current_command() if is_launch else None
159
209
  status = status_lib.ClusterStatus.INIT
160
210
  if ready:
161
211
  status = status_lib.ClusterStatus.UP
212
+ status_updated_at = int(time.time())
162
213
 
163
214
  # TODO (sumanth): Cluster history table will have multiple entries
164
215
  # when the cluster failover through multiple regions (one entry per region).
@@ -183,6 +234,8 @@ def add_or_update_cluster(cluster_name: str,
183
234
  cluster_launched_at = int(time.time())
184
235
  usage_intervals.append((cluster_launched_at, None))
185
236
 
237
+ user_hash = common_utils.get_user_hash()
238
+
186
239
  _DB.cursor.execute(
187
240
  'INSERT or REPLACE INTO clusters'
188
241
  # All the fields need to exist here, even if they don't need
@@ -191,7 +244,8 @@ def add_or_update_cluster(cluster_name: str,
191
244
  # specified.
192
245
  '(name, launched_at, handle, last_use, status, '
193
246
  'autostop, to_down, metadata, owner, cluster_hash, '
194
- 'storage_mounts_metadata, cluster_ever_up) '
247
+ 'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
248
+ 'config_hash, user_hash) '
195
249
  'VALUES ('
196
250
  # name
197
251
  '?, '
@@ -217,7 +271,7 @@ def add_or_update_cluster(cluster_name: str,
217
271
  # Keep the old metadata value if it exists, otherwise set it to
218
272
  # default {}.
219
273
  'COALESCE('
220
- '(SELECT metadata FROM clusters WHERE name=?), "{}"),'
274
+ '(SELECT metadata FROM clusters WHERE name=?), \'{}\'),'
221
275
  # Keep the old owner value if it exists, otherwise set it to
222
276
  # default null.
223
277
  'COALESCE('
@@ -228,7 +282,14 @@ def add_or_update_cluster(cluster_name: str,
228
282
  'COALESCE('
229
283
  '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
230
284
  # cluster_ever_up
231
- '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
285
+ '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?), '
286
+ # status_updated_at
287
+ '?,'
288
+ # config_hash
289
+ 'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)),'
290
+ # user_hash: keep original user_hash if it exists
291
+ 'COALESCE('
292
+ '(SELECT user_hash FROM clusters WHERE name=?), ?)'
232
293
  ')',
233
294
  (
234
295
  # name
@@ -260,6 +321,14 @@ def add_or_update_cluster(cluster_name: str,
260
321
  # cluster_ever_up
261
322
  cluster_name,
262
323
  int(ready),
324
+ # status_updated_at
325
+ status_updated_at,
326
+ # config_hash
327
+ config_hash,
328
+ cluster_name,
329
+ # user_hash
330
+ cluster_name,
331
+ user_hash,
263
332
  ))
264
333
 
265
334
  launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -267,7 +336,7 @@ def add_or_update_cluster(cluster_name: str,
267
336
  _DB.cursor.execute(
268
337
  'INSERT or REPLACE INTO cluster_history'
269
338
  '(cluster_hash, name, num_nodes, requested_resources, '
270
- 'launched_resources, usage_intervals) '
339
+ 'launched_resources, usage_intervals, user_hash) '
271
340
  'VALUES ('
272
341
  # hash
273
342
  '?, '
@@ -280,7 +349,10 @@ def add_or_update_cluster(cluster_name: str,
280
349
  # number of nodes
281
350
  '?, '
282
351
  # usage intervals
283
- '?)',
352
+ '?, '
353
+ # user_hash
354
+ '?'
355
+ ')',
284
356
  (
285
357
  # hash
286
358
  cluster_hash,
@@ -294,15 +366,37 @@ def add_or_update_cluster(cluster_name: str,
294
366
  pickle.dumps(launched_resources),
295
367
  # usage intervals
296
368
  pickle.dumps(usage_intervals),
369
+ # user_hash
370
+ user_hash,
297
371
  ))
298
372
 
299
373
  _DB.conn.commit()
300
374
 
301
375
 
376
+ def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
377
+ """Returns the user hash or the current user hash, if user_hash is None.
378
+
379
+ This is to ensure that the clusters created before the client-server
380
+ architecture (no user hash info previously) are associated with the current
381
+ user.
382
+ """
383
+ if user_hash is not None:
384
+ return user_hash
385
+ return common_utils.get_user_hash()
386
+
387
+
388
+ def update_cluster_handle(cluster_name: str,
389
+ cluster_handle: 'backends.ResourceHandle'):
390
+ handle = pickle.dumps(cluster_handle)
391
+ _DB.cursor.execute('UPDATE clusters SET handle=(?) WHERE name=(?)',
392
+ (handle, cluster_name))
393
+ _DB.conn.commit()
394
+
395
+
302
396
  def update_last_use(cluster_name: str):
303
397
  """Updates the last used command for the cluster."""
304
398
  _DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)',
305
- (common_utils.get_pretty_entry_point(), cluster_name))
399
+ (common_utils.get_current_command(), cluster_name))
306
400
  _DB.conn.commit()
307
401
 
308
402
 
@@ -330,11 +424,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
330
424
  # stopped VM, which leads to timeout.
331
425
  if hasattr(handle, 'stable_internal_external_ips'):
332
426
  handle.stable_internal_external_ips = None
427
+ current_time = int(time.time())
333
428
  _DB.cursor.execute(
334
- 'UPDATE clusters SET handle=(?), status=(?) '
335
- 'WHERE name=(?)', (
429
+ 'UPDATE clusters SET handle=(?), status=(?), '
430
+ 'status_updated_at=(?) WHERE name=(?)', (
336
431
  pickle.dumps(handle),
337
432
  status_lib.ClusterStatus.STOPPED.value,
433
+ current_time,
338
434
  cluster_name,
339
435
  ))
340
436
  _DB.conn.commit()
@@ -359,10 +455,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
359
455
 
360
456
  def set_cluster_status(cluster_name: str,
361
457
  status: status_lib.ClusterStatus) -> None:
362
- _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', (
363
- status.value,
364
- cluster_name,
365
- ))
458
+ current_time = int(time.time())
459
+ _DB.cursor.execute(
460
+ 'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
461
+ (status.value, current_time, cluster_name))
366
462
  count = _DB.cursor.rowcount
367
463
  _DB.conn.commit()
368
464
  assert count <= 1, count
@@ -570,15 +666,19 @@ def _load_storage_mounts_metadata(
570
666
 
571
667
  def get_cluster_from_name(
572
668
  cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
573
- rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)',
574
- (cluster_name,)).fetchall()
669
+ rows = _DB.cursor.execute(
670
+ 'SELECT name, launched_at, handle, last_use, status, autostop, '
671
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
672
+ 'cluster_ever_up, status_updated_at, config_hash, user_hash '
673
+ 'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
575
674
  for row in rows:
576
675
  # Explicitly specify the number of fields to unpack, so that
577
676
  # we can add new fields to the database in the future without
578
677
  # breaking the previous code.
579
678
  (name, launched_at, handle, last_use, status, autostop, metadata,
580
- to_down, owner, cluster_hash, storage_mounts_metadata,
581
- cluster_ever_up) = row[:12]
679
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
680
+ status_updated_at, config_hash, user_hash) = row
681
+ user_hash = _get_user_hash_or_current_user(user_hash)
582
682
  # TODO: use namedtuple instead of dict
583
683
  record = {
584
684
  'name': name,
@@ -594,6 +694,10 @@ def get_cluster_from_name(
594
694
  'storage_mounts_metadata':
595
695
  _load_storage_mounts_metadata(storage_mounts_metadata),
596
696
  'cluster_ever_up': bool(cluster_ever_up),
697
+ 'status_updated_at': status_updated_at,
698
+ 'user_hash': user_hash,
699
+ 'user_name': get_user(user_hash).name,
700
+ 'config_hash': config_hash,
597
701
  }
598
702
  return record
599
703
  return None
@@ -601,12 +705,16 @@ def get_cluster_from_name(
601
705
 
602
706
  def get_clusters() -> List[Dict[str, Any]]:
603
707
  rows = _DB.cursor.execute(
604
- 'select * from clusters order by launched_at desc').fetchall()
708
+ 'select name, launched_at, handle, last_use, status, autostop, '
709
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
710
+ 'cluster_ever_up, status_updated_at, config_hash, user_hash '
711
+ 'from clusters order by launched_at desc').fetchall()
605
712
  records = []
606
713
  for row in rows:
607
714
  (name, launched_at, handle, last_use, status, autostop, metadata,
608
- to_down, owner, cluster_hash, storage_mounts_metadata,
609
- cluster_ever_up) = row[:12]
715
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
716
+ status_updated_at, config_hash, user_hash) = row
717
+ user_hash = _get_user_hash_or_current_user(user_hash)
610
718
  # TODO: use namedtuple instead of dict
611
719
  record = {
612
720
  'name': name,
@@ -622,6 +730,10 @@ def get_clusters() -> List[Dict[str, Any]]:
622
730
  'storage_mounts_metadata':
623
731
  _load_storage_mounts_metadata(storage_mounts_metadata),
624
732
  'cluster_ever_up': bool(cluster_ever_up),
733
+ 'status_updated_at': status_updated_at,
734
+ 'user_hash': user_hash,
735
+ 'user_name': get_user(user_hash).name,
736
+ 'config_hash': config_hash,
625
737
  }
626
738
 
627
739
  records.append(record)
@@ -631,7 +743,8 @@ def get_clusters() -> List[Dict[str, Any]]:
631
743
  def get_clusters_from_history() -> List[Dict[str, Any]]:
632
744
  rows = _DB.cursor.execute(
633
745
  'SELECT ch.cluster_hash, ch.name, ch.num_nodes, '
634
- 'ch.launched_resources, ch.usage_intervals, clusters.status '
746
+ 'ch.launched_resources, ch.usage_intervals, clusters.status, '
747
+ 'ch.user_hash '
635
748
  'FROM cluster_history ch '
636
749
  'LEFT OUTER JOIN clusters '
637
750
  'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
@@ -650,7 +763,9 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
650
763
  launched_resources,
651
764
  usage_intervals,
652
765
  status,
653
- ) = row[:6]
766
+ user_hash,
767
+ ) = row[:7]
768
+ user_hash = _get_user_hash_or_current_user(user_hash)
654
769
 
655
770
  if status is not None:
656
771
  status = status_lib.ClusterStatus[status]
@@ -664,6 +779,7 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
664
779
  'cluster_hash': cluster_hash,
665
780
  'usage_intervals': pickle.loads(usage_intervals),
666
781
  'status': status,
782
+ 'user_hash': user_hash,
667
783
  }
668
784
 
669
785
  records.append(record)
@@ -679,17 +795,17 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
679
795
  return [row[0] for row in rows]
680
796
 
681
797
 
682
- def get_cached_enabled_clouds() -> List[clouds.Cloud]:
798
+ def get_cached_enabled_clouds() -> List['clouds.Cloud']:
683
799
  rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
684
800
  (_ENABLED_CLOUDS_KEY,))
685
801
  ret = []
686
802
  for (value,) in rows:
687
803
  ret = json.loads(value)
688
804
  break
689
- enabled_clouds: List[clouds.Cloud] = []
805
+ enabled_clouds: List['clouds.Cloud'] = []
690
806
  for c in ret:
691
807
  try:
692
- cloud = clouds.CLOUD_REGISTRY.from_str(c)
808
+ cloud = registry.CLOUD_REGISTRY.from_str(c)
693
809
  except ValueError:
694
810
  # Handle the case for the clouds whose support has been removed from
695
811
  # SkyPilot, e.g., 'local' was a cloud in the past and may be stored
@@ -712,7 +828,7 @@ def add_or_update_storage(storage_name: str,
712
828
  storage_status: status_lib.StorageStatus):
713
829
  storage_launched_at = int(time.time())
714
830
  handle = pickle.dumps(storage_handle)
715
- last_use = common_utils.get_pretty_entry_point()
831
+ last_use = common_utils.get_current_command()
716
832
 
717
833
  def status_check(status):
718
834
  return status in status_lib.StorageStatus
@@ -794,7 +910,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
794
910
 
795
911
 
796
912
  def get_storage() -> List[Dict[str, Any]]:
797
- rows = _DB.cursor.execute('select * from storage')
913
+ rows = _DB.cursor.execute('SELECT * FROM storage')
798
914
  records = []
799
915
  for name, launched_at, handle, last_use, status in rows:
800
916
  # TODO: use namedtuple instead of dict
sky/jobs/__init__.py CHANGED
@@ -1,33 +1,32 @@
1
1
  """Managed jobs."""
2
2
  import pathlib
3
3
 
4
+ from sky.jobs.client.sdk import cancel
5
+ from sky.jobs.client.sdk import dashboard
6
+ from sky.jobs.client.sdk import download_logs
7
+ from sky.jobs.client.sdk import launch
8
+ from sky.jobs.client.sdk import queue
9
+ from sky.jobs.client.sdk import tail_logs
4
10
  from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
11
+ from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
5
12
  from sky.jobs.constants import JOBS_CONTROLLER_TEMPLATE
6
13
  from sky.jobs.constants import JOBS_CONTROLLER_YAML_PREFIX
7
14
  from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
8
- from sky.jobs.core import cancel
9
- from sky.jobs.core import launch
10
- from sky.jobs.core import queue
11
- from sky.jobs.core import tail_logs
12
- from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
13
- from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
15
+ from sky.jobs.recovery_strategy import StrategyExecutor
14
16
  from sky.jobs.state import ManagedJobStatus
15
17
  from sky.jobs.utils import dump_managed_job_queue
16
18
  from sky.jobs.utils import format_job_table
17
- from sky.jobs.utils import JOB_CONTROLLER_NAME
18
19
  from sky.jobs.utils import load_managed_job_queue
19
20
  from sky.jobs.utils import ManagedJobCodeGen
20
21
 
21
22
  pathlib.Path(JOBS_TASK_YAML_PREFIX).expanduser().parent.mkdir(parents=True,
22
23
  exist_ok=True)
23
24
  __all__ = [
24
- 'RECOVERY_STRATEGIES',
25
- 'DEFAULT_RECOVERY_STRATEGY',
26
- 'JOB_CONTROLLER_NAME',
27
25
  # Constants
28
26
  'JOBS_CONTROLLER_TEMPLATE',
29
27
  'JOBS_CONTROLLER_YAML_PREFIX',
30
28
  'JOBS_TASK_YAML_PREFIX',
29
+ 'JOBS_CONTROLLER_LOGS_DIR',
31
30
  # Enums
32
31
  'ManagedJobStatus',
33
32
  # Core
@@ -35,9 +34,12 @@ __all__ = [
35
34
  'launch',
36
35
  'queue',
37
36
  'tail_logs',
37
+ 'dashboard',
38
+ 'download_logs',
38
39
  # utils
39
40
  'ManagedJobCodeGen',
40
41
  'format_job_table',
41
42
  'dump_managed_job_queue',
42
43
  'load_managed_job_queue',
44
+ 'StrategyExecutor',
43
45
  ]
File without changes