skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250624__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +12 -0
  5. sky/backends/cloud_vm_ray_backend.py +36 -13
  6. sky/client/cli/command.py +42 -21
  7. sky/client/sdk.py +12 -6
  8. sky/clouds/kubernetes.py +1 -0
  9. sky/core.py +88 -15
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +6 -0
  12. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/938-ce7991c156584b06.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-ce31493da9747ef4.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +6 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +6 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-cf490d1fa38f3740.js → [job]-171c27f4ca94861c.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-ecc5a7003776cfa7.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +1 -0
  32. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +3 -0
  33. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +1 -0
  34. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  35. sky/dashboard/out/clusters/[cluster].html +1 -1
  36. sky/dashboard/out/clusters.html +1 -1
  37. sky/dashboard/out/config.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra/[context].html +1 -1
  40. sky/dashboard/out/infra.html +1 -1
  41. sky/dashboard/out/jobs/[job].html +1 -1
  42. sky/dashboard/out/jobs.html +1 -1
  43. sky/dashboard/out/users.html +1 -1
  44. sky/dashboard/out/workspace/new.html +1 -1
  45. sky/dashboard/out/workspaces/[name].html +1 -1
  46. sky/dashboard/out/workspaces.html +1 -1
  47. sky/exceptions.py +11 -0
  48. sky/global_user_state.py +134 -20
  49. sky/jobs/client/sdk.py +0 -1
  50. sky/jobs/controller.py +5 -1
  51. sky/jobs/scheduler.py +4 -3
  52. sky/jobs/server/core.py +117 -51
  53. sky/jobs/state.py +15 -0
  54. sky/jobs/utils.py +114 -8
  55. sky/resources.py +1 -1
  56. sky/server/requests/payloads.py +6 -3
  57. sky/server/requests/requests.py +24 -1
  58. sky/server/server.py +4 -3
  59. sky/skylet/constants.py +5 -11
  60. sky/task.py +1 -26
  61. sky/templates/jobs-controller.yaml.j2 +12 -1
  62. sky/templates/kubernetes-ray.yml.j2 +1 -1
  63. sky/utils/admin_policy_utils.py +5 -1
  64. sky/utils/cli_utils/status_utils.py +25 -17
  65. sky/utils/command_runner.py +118 -12
  66. sky/utils/command_runner.pyi +57 -0
  67. sky/utils/common_utils.py +9 -1
  68. sky/utils/controller_utils.py +1 -2
  69. sky/utils/schemas.py +34 -35
  70. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/METADATA +1 -1
  71. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/RECORD +78 -77
  72. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  74. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  78. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  80. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  89. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  90. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-bde186946d353355.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-56412c7976b4655b.js} +0 -0
  92. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → zsALxITkbP8J8NVwSDwMo}/_ssgManifest.js +0 -0
  93. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/WHEEL +0 -0
  94. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/entry_points.txt +0 -0
  95. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/licenses/LICENSE +0 -0
  96. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -134,6 +134,12 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
     sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 ssh_key_table = sqlalchemy.Table(
@@ -308,6 +314,21 @@ def create_table():
             'password',
             sqlalchemy.Text(),
             default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
         session.commit()
 
 
@@ -597,6 +618,14 @@ def add_or_update_cluster(cluster_name: str,
         # Modify cluster history table
         launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
         launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
 
         insert_stmnt = insert_func(cluster_history_table).values(
             cluster_hash=cluster_hash,
@@ -605,7 +634,9 @@ def add_or_update_cluster(cluster_name: str,
             requested_resources=pickle.dumps(requested_resources),
             launched_resources=pickle.dumps(launched_resources),
             usage_intervals=pickle.dumps(usage_intervals),
-            user_hash=user_hash)
+            user_hash=user_hash,
+            **creation_info,
+        )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
             index_elements=[cluster_history_table.c.cluster_hash],
             set_={
@@ -617,7 +648,8 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(launched_resources),
                 cluster_history_table.c.usage_intervals:
                     pickle.dumps(usage_intervals),
-                cluster_history_table.c.user_hash: user_hash
+                cluster_history_table.c.user_hash: user_hash,
+                **creation_info,
             })
         session.execute(do_update_stmt)
 
@@ -1027,40 +1059,122 @@ def get_clusters() -> List[Dict[str, Any]]:
 
 
 @_init_db
-def get_clusters_from_history() -> List[Dict[str, Any]]:
+def get_clusters_from_history(
+        days: Optional[int] = None) -> List[Dict[str, Any]]:
+    """Get cluster reports from history.
+
+    Args:
+        days: If specified, only include historical clusters (those not
+            currently active) that were last used within the past 'days'
+            days. Active clusters are always included regardless of this
+            parameter.
+
+    Returns:
+        List of cluster records with history information.
+    """
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(
-            cluster_history_table.join(cluster_table,
-                                       cluster_history_table.c.cluster_hash ==
-                                       cluster_table.c.cluster_hash,
-                                       isouter=True)).all()
-
-    # '(cluster_hash, name, num_nodes, requested_resources, '
-    # 'launched_resources, usage_intervals) '
+        # Explicitly select columns from both tables to avoid ambiguity
+        query = session.query(
+            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.requested_resources,
+            cluster_history_table.c.launched_resources,
+            cluster_history_table.c.usage_intervals,
+            cluster_history_table.c.user_hash,
+            cluster_history_table.c.last_creation_yaml,
+            cluster_history_table.c.last_creation_command,
+            cluster_table.c.status, cluster_table.c.workspace,
+            cluster_table.c.status_updated_at).select_from(
+                cluster_history_table.join(cluster_table,
+                                           cluster_history_table.c.cluster_hash
+                                           == cluster_table.c.cluster_hash,
+                                           isouter=True))
+
+        rows = query.all()
+
+    # Prepare filtering parameters
+    cutoff_time = None
+    if days is not None:
+        cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
     records = []
     for row in rows:
-        # TODO: use namedtuple instead of dict
         user_hash = _get_user_hash_or_current_user(row.user_hash)
-        status = row.status
-        if status is not None:
-            status = status_lib.ClusterStatus[status]
+        launched_at = _get_cluster_launch_time(row.cluster_hash)
+        duration = _get_cluster_duration(row.cluster_hash)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
+        # Apply filtering: always include active clusters, filter historical
+        # ones by time
+        if cutoff_time is not None and status is None:  # Historical cluster
+            # For historical clusters, check if they were used recently
+            # Use the most recent activity from usage_intervals to determine
+            # last use
+            usage_intervals = []
+            if row.usage_intervals:
+                try:
+                    usage_intervals = pickle.loads(row.usage_intervals)
+                except (pickle.PickleError, AttributeError):
+                    usage_intervals = []
+
+            # Find the most recent activity time from usage_intervals
+            last_activity_time = None
+            if usage_intervals:
+                # Get the end time of the last interval (or start time if
+                # still running)
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+            # Skip historical clusters that haven't been used recently
+            if last_activity_time is None or last_activity_time < cutoff_time:
+                continue
+
+        # Parse launched resources safely
+        launched_resources = None
+        if row.launched_resources:
+            try:
+                launched_resources = pickle.loads(row.launched_resources)
+            except (pickle.PickleError, AttributeError):
+                launched_resources = None
+
+        # Parse usage intervals safely
+        usage_intervals = []
+        if row.usage_intervals:
+            try:
+                usage_intervals = pickle.loads(row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                usage_intervals = []
+
+        # Get user name from user hash
+        user = get_user(user_hash)
+        user_name = user.name if user is not None else None
+
         record = {
             'name': row.name,
-            'launched_at': _get_cluster_launch_time(row.cluster_hash),
-            'duration': _get_cluster_duration(row.cluster_hash),
+            'launched_at': launched_at,
+            'duration': duration,
             'num_nodes': row.num_nodes,
-            'resources': pickle.loads(row.launched_resources),
+            'resources': launched_resources,
             'cluster_hash': row.cluster_hash,
-            'usage_intervals': pickle.loads(row.usage_intervals),
+            'usage_intervals': usage_intervals,
             'status': status,
             'user_hash': user_hash,
+            'user_name': user_name,
+            'workspace': row.workspace,
+            'last_creation_yaml': row.last_creation_yaml,
+            'last_creation_command': row.last_creation_command,
         }
 
         records.append(record)
 
     # sort by launch time, descending in recency
-    records = sorted(records, key=lambda record: -record['launched_at'])
+    records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
    return records
 
 
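Note: the new `days` argument to `get_clusters_from_history` only filters historical rows; active clusters always pass through, and a historical row is kept when the end of its last usage interval (or its start, if the interval is still open) falls within the cutoff. A minimal standalone sketch of that cutoff logic, with made-up interval data:

    import time

    def last_activity(usage_intervals):
        # Each interval is (start_ts, end_ts); end_ts is None while running.
        if not usage_intervals:
            return None
        start, end = usage_intervals[-1]
        return end if end is not None else start

    days = 7
    cutoff = int(time.time()) - days * 24 * 60 * 60
    history = {
        'old-cluster': [(cutoff - 30 * 86400, cutoff - 29 * 86400)],
        'recent-cluster': [(cutoff + 3600, None)],
    }
    kept = [name for name, intervals in history.items()
            if (last_activity(intervals) or 0) >= cutoff]
    print(kept)  # ['recent-cluster']
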
sky/jobs/client/sdk.py CHANGED
@@ -49,7 +49,6 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
-        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/controller.py CHANGED
@@ -603,7 +603,11 @@ def _cleanup(job_id: int, dag_yaml: str):
         # mounts.
         for file_mount in (task.file_mounts or {}).values():
             try:
-                if not data_utils.is_cloud_store_url(file_mount):
+                # For consolidation mode, there is no two-hop file mounts
+                # and the file path here represents the real user data.
+                # We skip the cleanup for consolidation mode.
+                if (not data_utils.is_cloud_store_url(file_mount) and
+                        not managed_job_utils.is_consolidation_mode()):
                     path = os.path.expanduser(file_mount)
                     if os.path.isdir(path):
                         shutil.rmtree(path)
sky/jobs/scheduler.py CHANGED
@@ -40,6 +40,7 @@ from argparse import ArgumentParser
 import contextlib
 from functools import lru_cache
 import os
+import sys
 import time
 import typing
 
@@ -89,12 +90,12 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = ('python -u -m sky.jobs.controller '
+    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                           f'{dag_yaml_path} --job-id {job_id};')
 
     # If the command line here is changed, please also update
-    # utils._controller_process_alive. `--job-id X` should be at
-    # the end.
+    # utils._controller_process_alive. The substring `--job-id X`
+    # should be in the command.
     run_cmd = (f'{activate_python_env_cmd}'
                f'{source_environment_cmd}'
                f'{run_controller_cmd}')
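
Note: replacing the bare `python` with `sys.executable` pins the controller subprocess to the interpreter that is running the scheduler, which matters when the scheduler lives inside a virtualenv that a plain `python` on PATH would miss. A rough illustration of the resulting command string (the DAG path and job id below are placeholders, not values from the codebase):

    import sys

    job_id = 42                      # placeholder
    dag_yaml_path = '/tmp/dag.yaml'  # placeholder

    old_cmd = ('python -u -m sky.jobs.controller '
               f'{dag_yaml_path} --job-id {job_id};')
    new_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
               f'{dag_yaml_path} --job-id {job_id};')
    print(old_cmd)
    print(new_cmd)
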
sky/jobs/server/core.py CHANGED
@@ -1,5 +1,6 @@
 """SDK functions for managed jobs."""
 import os
+import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -20,6 +21,7 @@ from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
 from sky.skylet import constants as skylet_constants
@@ -43,6 +45,72 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
+def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Maybe upload files to the controller.
+
+    In consolidation mode, we don't need to upload files to the controller as
+    the API server and the controller are colocated.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    if managed_job_utils.is_consolidation_mode():
+        return local_to_controller_file_mounts
+
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
+        dag.name,
+        workspace=skypilot_config.get_active_workspace(
+            force_user_workspace=True),
+        entrypoint=common_utils.get_current_command())
+    for task_id, task in enumerate(dag.tasks):
+        resources_str = backend_utils.get_task_resources_str(
+            task, is_managed_job=True)
+        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                      task.name, resources_str)
+    return consolidation_mode_job_id
+
+
 @timeline.event
 @usage_lib.entrypoint
 def launch(
@@ -103,7 +171,7 @@ def launch(
                     'will be auto-generated) .')
         task_names.add(task_.name)
 
-        # Check for priority in resources first, then fall back to job priority
+        # Check for priority in resources
         task_priority = None
         if task_.resources:
             # Convert set to list to access elements by index
@@ -121,20 +189,6 @@ def launch(
                             f'{resource.priority} but expected {task_priority}.'
                         )
 
-        # Check for conflict between resources priority and job
-        # priority
-        if task_.job_priority is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    f'Task {task_.name!r}: Cannot specify both '
-                    f'resources.priority ({task_priority}) and '
-                    f'job.priority ({task_.job_priority}). Please use only '
-                    'one priority specification method.')
-
-        # Fall back to job priority if no resources priority found
-        if task_priority is None:
-            task_priority = task_.job_priority
-
         if task_priority is not None:
             if (priority is not None and priority != task_priority):
                 with ux_utils.print_exception_no_traceback():
@@ -183,34 +237,7 @@ def launch(
                 f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                 f'Reason: {common_utils.format_exception(e)}')
 
-    local_to_controller_file_mounts = {}
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
-
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
+    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
 
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
@@ -233,6 +260,13 @@ def launch(
             controller=controller,
             task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+        consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
+
+        # This is only needed for non-consolidation mode. For consolidation
+        # mode, the controller uses the same catalog as API server.
+        modified_catalogs = {} if consolidation_mode_job_id is not None else (
+            service_catalog_common.get_modified_catalog_file_mounts())
+
         vars_to_fill = {
             'remote_original_user_yaml_path': remote_original_user_yaml_path,
             'original_user_dag_path': original_user_yaml_path.name,
@@ -244,9 +278,9 @@ def launch(
             'dag_name': dag.name,
             'remote_user_config_path': remote_user_config_path,
             'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
+            'modified_catalogs': modified_catalogs,
             'priority': priority,
+            'consolidation_mode_job_id': consolidation_mode_job_id,
             **controller_utils.shared_controller_vars_to_fill(
                 controller,
                 remote_user_config_path=remote_user_config_path,
@@ -285,12 +319,44 @@ def launch(
             # workspace A, but the controller is in workspace B, the
             # intermediate bucket and newly created bucket should be in
             # workspace A.
-            return execution.launch(task=controller_task,
-                                    cluster_name=controller_name,
-                                    stream_logs=stream_logs,
-                                    retry_until_up=True,
-                                    fast=True,
-                                    _disable_controller_check=True)
+            if consolidation_mode_job_id is None:
+                return execution.launch(task=controller_task,
+                                        cluster_name=controller_name,
+                                        stream_logs=stream_logs,
+                                        retry_until_up=True,
+                                        fast=True,
+                                        _disable_controller_check=True)
+            # Manually launch the scheduler process in consolidation mode.
+            local_handle = backend_utils.is_controller_accessible(
+                controller=controller, stopped_message='')
+            backend = backend_utils.get_backend_from_handle(local_handle)
+            assert isinstance(backend, backends.CloudVmRayBackend)
+            backend.sync_file_mounts(
+                handle=local_handle,
+                all_file_mounts=controller_task.file_mounts,
+                storage_mounts=controller_task.storage_mounts)
+            run_script = controller_task.run
+            assert isinstance(run_script, str)
+            # Manually add the env variables to the run script. Originally
+            # this is done in ray jobs submission but now we have to do it
+            # manually because there is no ray runtime on the API server.
+            env_cmds = [
+                f'export {k}={v!r}'
+                for k, v in controller_task.envs.items()
+            ]
+            run_script = '\n'.join(env_cmds + [run_script])
+            # Dump script for high availability recovery.
+            if controller_utils.high_availability_specified(
+                    controller_name):
+                dump_script_path = (
+                    managed_job_utils.get_ha_dump_script_path(
+                        consolidation_mode_job_id))
+                dump_script_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(dump_script_path, 'w',
+                          encoding='utf-8') as script_f:
+                    script_f.write(run_script)
+            backend.run_on_head(local_handle, run_script)
+            return consolidation_mode_job_id, local_handle
 
 
 def queue_from_kubernetes_pod(
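
Note: in consolidation mode the controller script is executed directly via `run_on_head`, so the task's environment variables are prepended to the script as `export` lines instead of being injected by a Ray job submission. A small standalone sketch of that transformation (the variable names and values are invented for illustration):

    envs = {'SKYPILOT_DEV': '1', 'MESSAGE': "hello 'world'"}
    run_script = 'echo "$MESSAGE"'

    env_cmds = [f'export {k}={v!r}' for k, v in envs.items()]
    run_script = '\n'.join(env_cmds + [run_script])
    print(run_script)
    # export SKYPILOT_DEV='1'
    # export MESSAGE="hello 'world'"
    # echo "$MESSAGE"
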
sky/jobs/state.py CHANGED
@@ -463,6 +463,21 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
             entrypoint))
 
 
+@_init_db
+def set_job_info_without_job_id(name: str, workspace: str,
+                                entrypoint: str) -> int:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            """\
+            INSERT INTO job_info
+            (name, schedule_state, workspace, entrypoint)
+            VALUES (?, ?, ?, ?)""",
+            (name, ManagedJobScheduleState.INACTIVE.value, workspace,
+             entrypoint))
+        return cursor.lastrowid
+
+
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""