skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -111,6 +111,23 @@ storage_table = sqlalchemy.Table(
     sqlalchemy.Column('status', sqlalchemy.Text),
 )
 
+volume_table = sqlalchemy.Table(
+    'volumes',
+    Base.metadata,
+    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('launched_at', sqlalchemy.Integer),
+    sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('workspace',
+                      sqlalchemy.Text,
+                      server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
+    sqlalchemy.Column('last_attached_at',
+                      sqlalchemy.Integer,
+                      server_default=None),
+    sqlalchemy.Column('last_use', sqlalchemy.Text),
+    sqlalchemy.Column('status', sqlalchemy.Text),
+)
+
 # Table for Cluster History
 # usage_intervals: List[Tuple[int, int]]
 # Specifies start and end timestamps of cluster.
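Note: the new volumes table stores an opaque, pickled handle (a models.VolumeConfig, per the add_volume function later in this diff) in its LargeBinary handle column. A minimal, self-contained sketch of that round-trip pattern, using an in-memory SQLite engine and a stand-in dict as the handle (not SkyPilot's actual objects):

    import pickle
    import time

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    volumes = sqlalchemy.Table(
        'volumes', metadata,
        sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('launched_at', sqlalchemy.Integer),
        sqlalchemy.Column('handle', sqlalchemy.LargeBinary))

    engine = sqlalchemy.create_engine('sqlite://')  # in-memory database
    metadata.create_all(engine)

    with engine.begin() as conn:
        # The handle column holds a pickled object, as in the table above.
        conn.execute(volumes.insert().values(
            name='vol-1',
            launched_at=int(time.time()),
            handle=pickle.dumps({'type': 'k8s-pvc', 'size': '10Gi'})))
        row = conn.execute(volumes.select()).first()
        print(row.name, pickle.loads(row.handle))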
@@ -134,6 +151,12 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
     sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 ssh_key_table = sqlalchemy.Table(
@@ -308,6 +331,21 @@ def create_table():
             'password',
             sqlalchemy.Text(),
             default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
         session.commit()
 
 
@@ -597,6 +635,14 @@ def add_or_update_cluster(cluster_name: str,
         # Modify cluster history table
         launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
         launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
 
         insert_stmnt = insert_func(cluster_history_table).values(
             cluster_hash=cluster_hash,
@@ -605,7 +651,9 @@ def add_or_update_cluster(cluster_name: str,
             requested_resources=pickle.dumps(requested_resources),
             launched_resources=pickle.dumps(launched_resources),
             usage_intervals=pickle.dumps(usage_intervals),
-            user_hash=user_hash)
+            user_hash=user_hash,
+            **creation_info,
+        )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
             index_elements=[cluster_history_table.c.cluster_hash],
             set_={
@@ -617,7 +665,8 @@ def add_or_update_cluster(cluster_name: str,
                 pickle.dumps(launched_resources),
                 cluster_history_table.c.usage_intervals:
                     pickle.dumps(usage_intervals),
-                cluster_history_table.c.user_hash: user_hash
+                cluster_history_table.c.user_hash: user_hash,
+                **creation_info,
             })
         session.execute(do_update_stmt)
 
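Note: the history write path above relies on a dialect-specific upsert; insert_func is either sqlite.insert or postgresql.insert, both of which support on_conflict_do_update. A runnable sketch of the pattern on SQLite (the demo table and its values are illustrative, not SkyPilot's schema):

    import sqlalchemy
    from sqlalchemy.dialects.sqlite import insert

    metadata = sqlalchemy.MetaData()
    demo = sqlalchemy.Table(
        'demo', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('user_hash', sqlalchemy.Text))

    engine = sqlalchemy.create_engine('sqlite://')
    metadata.create_all(engine)

    stmt = insert(demo).values(cluster_hash='abc', user_hash='u1')
    # On a primary-key collision, update the row instead of raising.
    stmt = stmt.on_conflict_do_update(index_elements=[demo.c.cluster_hash],
                                      set_={demo.c.user_hash: 'u2'})
    with engine.begin() as conn:
        conn.execute(stmt)
        conn.execute(stmt)  # second run takes the conflict branch
        print(conn.execute(demo.select()).all())  # [('abc', 'u2')]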
@@ -1027,40 +1076,122 @@ def get_clusters() -> List[Dict[str, Any]]:
 
 
 @_init_db
-def get_clusters_from_history() -> List[Dict[str, Any]]:
+def get_clusters_from_history(
+        days: Optional[int] = None) -> List[Dict[str, Any]]:
+    """Get cluster reports from history.
+
+    Args:
+        days: If specified, only include historical clusters (those not
+            currently active) that were last used within the past 'days'
+            days. Active clusters are always included regardless of this
+            parameter.
+
+    Returns:
+        List of cluster records with history information.
+    """
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(
-            cluster_history_table.join(cluster_table,
-                                       cluster_history_table.c.cluster_hash ==
-                                       cluster_table.c.cluster_hash,
-                                       isouter=True)).all()
-
-    # '(cluster_hash, name, num_nodes, requested_resources, '
-    # 'launched_resources, usage_intervals) '
+        # Explicitly select columns from both tables to avoid ambiguity
+        query = session.query(
+            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.requested_resources,
+            cluster_history_table.c.launched_resources,
+            cluster_history_table.c.usage_intervals,
+            cluster_history_table.c.user_hash,
+            cluster_history_table.c.last_creation_yaml,
+            cluster_history_table.c.last_creation_command,
+            cluster_table.c.status, cluster_table.c.workspace,
+            cluster_table.c.status_updated_at).select_from(
+                cluster_history_table.join(cluster_table,
+                                           cluster_history_table.c.cluster_hash
+                                           == cluster_table.c.cluster_hash,
+                                           isouter=True))
+
+        rows = query.all()
+
+    # Prepare filtering parameters
+    cutoff_time = None
+    if days is not None:
+        cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
     records = []
     for row in rows:
-        # TODO: use namedtuple instead of dict
         user_hash = _get_user_hash_or_current_user(row.user_hash)
-        status = row.status
-        if status is not None:
-            status = status_lib.ClusterStatus[status]
+        launched_at = _get_cluster_launch_time(row.cluster_hash)
+        duration = _get_cluster_duration(row.cluster_hash)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
+        # Apply filtering: always include active clusters, filter historical
+        # ones by time
+        if cutoff_time is not None and status is None:  # Historical cluster
+            # For historical clusters, check if they were used recently
+            # Use the most recent activity from usage_intervals to determine
+            # last use
+            usage_intervals = []
+            if row.usage_intervals:
+                try:
+                    usage_intervals = pickle.loads(row.usage_intervals)
+                except (pickle.PickleError, AttributeError):
+                    usage_intervals = []
+
+            # Find the most recent activity time from usage_intervals
+            last_activity_time = None
+            if usage_intervals:
+                # Get the end time of the last interval (or start time if
+                # still running)
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+            # Skip historical clusters that haven't been used recently
+            if last_activity_time is None or last_activity_time < cutoff_time:
+                continue
+
+        # Parse launched resources safely
+        launched_resources = None
+        if row.launched_resources:
+            try:
+                launched_resources = pickle.loads(row.launched_resources)
+            except (pickle.PickleError, AttributeError):
+                launched_resources = None
+
+        # Parse usage intervals safely
+        usage_intervals = []
+        if row.usage_intervals:
+            try:
+                usage_intervals = pickle.loads(row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                usage_intervals = []
+
+        # Get user name from user hash
+        user = get_user(user_hash)
+        user_name = user.name if user is not None else None
+
         record = {
             'name': row.name,
-            'launched_at': _get_cluster_launch_time(row.cluster_hash),
-            'duration': _get_cluster_duration(row.cluster_hash),
+            'launched_at': launched_at,
+            'duration': duration,
             'num_nodes': row.num_nodes,
-            'resources': pickle.loads(row.launched_resources),
+            'resources': launched_resources,
             'cluster_hash': row.cluster_hash,
-            'usage_intervals': pickle.loads(row.usage_intervals),
+            'usage_intervals': usage_intervals,
             'status': status,
             'user_hash': user_hash,
+            'user_name': user_name,
+            'workspace': row.workspace,
+            'last_creation_yaml': row.last_creation_yaml,
+            'last_creation_command': row.last_creation_command,
        }
 
        records.append(record)
 
    # sort by launch time, descending in recency
-    records = sorted(records, key=lambda record: -record['launched_at'])
+    records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
    return records
 
 
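Note: the new days filter keeps a historical (inactive) cluster only when the end of its last usage interval falls inside the window; active clusters always pass. A standalone sketch of just that predicate, mirroring the logic above:

    import time

    def keep_historical(usage_intervals, days):
        """Return True if an inactive cluster was used within the window."""
        cutoff = int(time.time()) - days * 24 * 60 * 60
        if not usage_intervals:
            return False
        start, end = usage_intervals[-1]
        last_activity = end if end is not None else start  # None end: running
        return last_activity >= cutoff

    now = int(time.time())
    day = 24 * 60 * 60
    print(keep_historical([(now - 5 * day, now - 2 * day)], days=7))    # True
    print(keep_historical([(now - 60 * day, now - 40 * day)], days=7))  # False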
@@ -1312,6 +1443,118 @@ def get_storage() -> List[Dict[str, Any]]:
     return records
 
 
+@_init_db
+def get_volume_names_start_with(starts_with: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).filter(
+            volume_table.c.name.like(f'{starts_with}%')).all()
+        return [row.name for row in rows]
+
+
+@_init_db
+def get_volumes() -> List[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).all()
+        records = []
+        for row in rows:
+            records.append({
+                'name': row.name,
+                'launched_at': row.launched_at,
+                'handle': pickle.loads(row.handle),
+                'user_hash': row.user_hash,
+                'workspace': row.workspace,
+                'last_attached_at': row.last_attached_at,
+                'last_use': row.last_use,
+                'status': status_lib.VolumeStatus[row.status],
+            })
+        return records
+
+
+@_init_db
+def get_volume_by_name(name: str) -> Optional[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(volume_table).filter_by(name=name).first()
+        if row:
+            return {
+                'name': row.name,
+                'launched_at': row.launched_at,
+                'handle': pickle.loads(row.handle),
+                'user_hash': row.user_hash,
+                'workspace': row.workspace,
+                'last_attached_at': row.last_attached_at,
+                'last_use': row.last_use,
+                'status': status_lib.VolumeStatus[row.status],
+            }
+        return None
+
+
+@_init_db
+def add_volume(name: str, config: models.VolumeConfig,
+               status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    volume_launched_at = int(time.time())
+    handle = pickle.dumps(config)
+    last_use = common_utils.get_current_command()
+    user_hash = common_utils.get_current_user().id
+    active_workspace = skypilot_config.get_active_workspace()
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(volume_table).values(
+            name=name,
+            launched_at=volume_launched_at,
+            handle=handle,
+            user_hash=user_hash,
+            workspace=active_workspace,
+            last_attached_at=None,
+            last_use=last_use,
+            status=status.value,
+        )
+        do_update_stmt = insert_stmnt.on_conflict_do_nothing()
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+def update_volume(name: str, last_attached_at: int,
+                  status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.last_attached_at: last_attached_at,
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+def update_volume_status(name: str, status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+def delete_volume(name: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).delete()
+        session.commit()
+
+
 @_init_db
 def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
     assert _SQLALCHEMY_ENGINE is not None
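Note: add_volume and update_volume persist status.value, while the readers rebuild the enum with status_lib.VolumeStatus[row.status], which indexes by member *name*. The round-trip therefore assumes each member's value equals its name. A stand-in enum (illustrative only; the real members live in sky/utils/status_lib.py and are not shown in this diff) demonstrating the two lookup styles:

    import enum

    class VolumeStatus(enum.Enum):  # stand-in, not SkyPilot's enum
        READY = 'READY'
        IN_USE = 'IN_USE'

    stored = VolumeStatus.READY.value  # what add_volume writes: 'READY'
    loaded = VolumeStatus[stored]      # name lookup, as in get_volumes
    assert loaded is VolumeStatus.READY
    # Value lookup agrees only because value == name for every member:
    assert VolumeStatus(stored) is loaded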
sky/jobs/client/sdk.py CHANGED
@@ -7,10 +7,10 @@ import webbrowser
 import click
 
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.server import common as server_common
+from sky.server import rest
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -22,11 +22,7 @@ from sky.utils import dag_utils
 if typing.TYPE_CHECKING:
     import io
 
-    import requests
-
     import sky
-else:
-    requests = adaptors_common.LazyImport('requests')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -49,7 +45,6 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
-        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
@@ -87,7 +82,7 @@ def launch(
         task=dag_str,
         name=name,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/launch',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -147,7 +142,7 @@ def queue(refresh: bool,
         all_users=all_users,
         job_ids=job_ids,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/queue',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -187,7 +182,7 @@ def cancel(
         all=all,
         all_users=all_users,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/cancel',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -198,6 +193,7 @@ def cancel(
 
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
+@rest.retry_on_server_unavailable()
 def tail_logs(name: Optional[str] = None,
               job_id: Optional[int] = None,
               follow: bool = True,
@@ -237,7 +233,7 @@ def tail_logs(name: Optional[str] = None,
         refresh=refresh,
         tail=tail,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/logs',
         json=json.loads(body.model_dump_json()),
         stream=True,
@@ -245,7 +241,12 @@ def tail_logs(name: Optional[str] = None,
         cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
-    return sdk.stream_response(request_id, response, output_stream)
+    # Log request is idempotent when tail is 0, thus can resume previous
+    # streaming point on retry.
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=(tail == 0))
 
 
 @usage_lib.entrypoint
@@ -282,7 +283,7 @@ def download_logs(
         controller=controller,
         local_dir=local_dir,
    )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/download_logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
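Note: throughout this file, requests.post is replaced by rest.post from the new sky/server/rest.py (added in this release, +152 lines), and tail_logs additionally gains a rest.retry_on_server_unavailable() decorator plus a resumable streaming flag. The wrapper's source is not shown in this excerpt; the following is only a rough guess at the shape of such a retrying POST helper, with every name and parameter assumed:

    import time

    import requests

    def post(url, max_retries=3, backoff=1.0, **kwargs):
        """POST with retries while the API server restarts (hypothetical)."""
        for attempt in range(max_retries):
            try:
                response = requests.post(url, **kwargs)
                if response.status_code != 503:  # not "server unavailable"
                    return response
            except requests.exceptions.ConnectionError:
                pass  # server not reachable yet; retry below
            time.sleep(backoff * 2**attempt)  # exponential backoff
        return requests.post(url, **kwargs)  # final attempt, surface errors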
sky/jobs/controller.py CHANGED
@@ -603,7 +603,11 @@ def _cleanup(job_id: int, dag_yaml: str):
         # mounts.
         for file_mount in (task.file_mounts or {}).values():
             try:
-                if not data_utils.is_cloud_store_url(file_mount):
+                # For consolidation mode, there is no two-hop file mounts
+                # and the file path here represents the real user data.
+                # We skip the cleanup for consolidation mode.
+                if (not data_utils.is_cloud_store_url(file_mount) and
+                        not managed_job_utils.is_consolidation_mode()):
                     path = os.path.expanduser(file_mount)
                     if os.path.isdir(path):
                         shutil.rmtree(path)
sky/jobs/scheduler.py CHANGED
@@ -40,6 +40,7 @@ from argparse import ArgumentParser
 import contextlib
 from functools import lru_cache
 import os
+import sys
 import time
 import typing
 
@@ -89,12 +90,12 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = ('python -u -m sky.jobs.controller '
+    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                           f'{dag_yaml_path} --job-id {job_id};')
 
     # If the command line here is changed, please also update
-    # utils._controller_process_alive. `--job-id X` should be at
-    # the end.
+    # utils._controller_process_alive. The substring `--job-id X`
+    # should be in the command.
     run_cmd = (f'{activate_python_env_cmd}'
                f'{source_environment_cmd}'
                f'{run_controller_cmd}')
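Note: switching from a bare python to sys.executable pins the spawned controller to the exact interpreter running the scheduler, rather than whatever python happens to resolve to on PATH (which may be missing or a different version). For example:

    import shutil
    import sys

    print(sys.executable)          # absolute path of the current interpreter
    print(shutil.which('python'))  # PATH lookup; may be None or another Python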