skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250624__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +12 -0
- sky/backends/cloud_vm_ray_backend.py +36 -13
- sky/client/cli/command.py +42 -21
- sky/client/sdk.py +12 -6
- sky/clouds/kubernetes.py +1 -0
- sky/core.py +88 -15
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-ce7991c156584b06.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-ce31493da9747ef4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-cf490d1fa38f3740.js → [job]-171c27f4ca94861c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-ecc5a7003776cfa7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +1 -0
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +3 -0
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -0
- sky/global_user_state.py +134 -20
- sky/jobs/client/sdk.py +0 -1
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +117 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/resources.py +1 -1
- sky/server/requests/payloads.py +6 -3
- sky/server/requests/requests.py +24 -1
- sky/server/server.py +4 -3
- sky/skylet/constants.py +5 -11
- sky/task.py +1 -26
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/schemas.py +34 -35
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/RECORD +78 -77
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-bde186946d353355.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-56412c7976b4655b.js} +0 -0
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → zsALxITkbP8J8NVwSDwMo}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -134,6 +134,12 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
     sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 ssh_key_table = sqlalchemy.Table(
@@ -308,6 +314,21 @@ def create_table():
             'password',
             sqlalchemy.Text(),
            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
         session.commit()
@@ -597,6 +618,14 @@ def add_or_update_cluster(cluster_name: str,
         # Modify cluster history table
         launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
         launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
 
         insert_stmnt = insert_func(cluster_history_table).values(
             cluster_hash=cluster_hash,
@@ -605,7 +634,9 @@ def add_or_update_cluster(cluster_name: str,
             requested_resources=pickle.dumps(requested_resources),
             launched_resources=pickle.dumps(launched_resources),
             usage_intervals=pickle.dumps(usage_intervals),
-            user_hash=user_hash)
+            user_hash=user_hash,
+            **creation_info,
+        )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
             index_elements=[cluster_history_table.c.cluster_hash],
             set_={
@@ -617,7 +648,8 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(launched_resources),
                 cluster_history_table.c.usage_intervals:
                     pickle.dumps(usage_intervals),
-                cluster_history_table.c.user_hash: user_hash
+                cluster_history_table.c.user_hash: user_hash,
+                **creation_info,
             })
         session.execute(do_update_stmt)
 
@@ -1027,40 +1059,122 @@ def get_clusters() -> List[Dict[str, Any]]:
 
 
 @_init_db
-def get_clusters_from_history(
+def get_clusters_from_history(
+        days: Optional[int] = None) -> List[Dict[str, Any]]:
+    """Get cluster reports from history.
+
+    Args:
+        days: If specified, only include historical clusters (those not
+            currently active) that were last used within the past 'days'
+            days. Active clusters are always included regardless of this
+            parameter.
+
+    Returns:
+        List of cluster records with history information.
+    """
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-
-
-
-
-
-
-
-
+        # Explicitly select columns from both tables to avoid ambiguity
+        query = session.query(
+            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.requested_resources,
+            cluster_history_table.c.launched_resources,
+            cluster_history_table.c.usage_intervals,
+            cluster_history_table.c.user_hash,
+            cluster_history_table.c.last_creation_yaml,
+            cluster_history_table.c.last_creation_command,
+            cluster_table.c.status, cluster_table.c.workspace,
+            cluster_table.c.status_updated_at).select_from(
+                cluster_history_table.join(cluster_table,
+                                           cluster_history_table.c.cluster_hash
+                                           == cluster_table.c.cluster_hash,
+                                           isouter=True))
+
+        rows = query.all()
+
+        # Prepare filtering parameters
+        cutoff_time = None
+        if days is not None:
+            cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
         records = []
         for row in rows:
-            # TODO: use namedtuple instead of dict
             user_hash = _get_user_hash_or_current_user(row.user_hash)
-
-
-
+            launched_at = _get_cluster_launch_time(row.cluster_hash)
+            duration = _get_cluster_duration(row.cluster_hash)
+
+            # Parse status
+            status = None
+            if row.status:
+                status = status_lib.ClusterStatus[row.status]
+
+            # Apply filtering: always include active clusters, filter historical
+            # ones by time
+            if cutoff_time is not None and status is None:  # Historical cluster
+                # For historical clusters, check if they were used recently
+                # Use the most recent activity from usage_intervals to determine
+                # last use
+                usage_intervals = []
+                if row.usage_intervals:
+                    try:
+                        usage_intervals = pickle.loads(row.usage_intervals)
+                    except (pickle.PickleError, AttributeError):
+                        usage_intervals = []
+
+                # Find the most recent activity time from usage_intervals
+                last_activity_time = None
+                if usage_intervals:
+                    # Get the end time of the last interval (or start time if
+                    # still running)
+                    last_interval = usage_intervals[-1]
+                    last_activity_time = (last_interval[1] if last_interval[1]
+                                          is not None else last_interval[0])
+
+                # Skip historical clusters that haven't been used recently
+                if last_activity_time is None or last_activity_time < cutoff_time:
+                    continue
+
+            # Parse launched resources safely
+            launched_resources = None
+            if row.launched_resources:
+                try:
+                    launched_resources = pickle.loads(row.launched_resources)
+                except (pickle.PickleError, AttributeError):
+                    launched_resources = None
+
+            # Parse usage intervals safely
+            usage_intervals = []
+            if row.usage_intervals:
+                try:
+                    usage_intervals = pickle.loads(row.usage_intervals)
+                except (pickle.PickleError, AttributeError):
+                    usage_intervals = []
+
+            # Get user name from user hash
+            user = get_user(user_hash)
+            user_name = user.name if user is not None else None
+
             record = {
                 'name': row.name,
-                'launched_at':
-                'duration':
+                'launched_at': launched_at,
+                'duration': duration,
                 'num_nodes': row.num_nodes,
-                'resources':
+                'resources': launched_resources,
                 'cluster_hash': row.cluster_hash,
-                'usage_intervals':
+                'usage_intervals': usage_intervals,
                 'status': status,
                 'user_hash': user_hash,
+                'user_name': user_name,
+                'workspace': row.workspace,
+                'last_creation_yaml': row.last_creation_yaml,
+                'last_creation_command': row.last_creation_command,
            }
 
            records.append(record)
 
        # sort by launch time, descending in recency
-        records = sorted(records, key=lambda record: -record['launched_at'])
+        records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
        return records
 
 
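The history row is written through an insert-or-update statement, and the new last_creation_yaml / last_creation_command columns are only included when a creation YAML is present, so later updates without that info do not clobber the stored values. Below is a minimal, self-contained sketch of that upsert pattern against an in-memory SQLite database; the simplified cluster_history table and the upsert helper are illustrative stand-ins, not SkyPilot's actual schema or API:

    import sqlalchemy
    from sqlalchemy.dialects.sqlite import insert as sqlite_insert

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    history = sqlalchemy.Table(
        'cluster_history', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('user_hash', sqlalchemy.Text),
        sqlalchemy.Column('last_creation_yaml', sqlalchemy.Text),
        sqlalchemy.Column('last_creation_command', sqlalchemy.Text))
    metadata.create_all(engine)


    def upsert(conn, cluster_hash, user_hash, creation_yaml=None, command=None):
        # Only touch the creation columns when a new creation YAML is known,
        # so an update without this info keeps the previously stored values.
        creation_info = {}
        if creation_yaml is not None:
            creation_info = {
                'last_creation_yaml': creation_yaml,
                'last_creation_command': command,
            }
        stmt = sqlite_insert(history).values(cluster_hash=cluster_hash,
                                             user_hash=user_hash,
                                             **creation_info)
        stmt = stmt.on_conflict_do_update(
            index_elements=[history.c.cluster_hash],
            set_={'user_hash': user_hash, **creation_info})
        conn.execute(stmt)


    with engine.begin() as conn:
        upsert(conn, 'abc123', 'alice', creation_yaml='resources: {cpus: 2}',
               command='sky launch task.yaml')
        upsert(conn, 'abc123', 'alice')  # keeps the stored YAML and command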
sky/jobs/client/sdk.py
CHANGED
@@ -49,7 +49,6 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
-        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/controller.py
CHANGED
@@ -603,7 +603,11 @@ def _cleanup(job_id: int, dag_yaml: str):
     # mounts.
     for file_mount in (task.file_mounts or {}).values():
         try:
-
+            # For consolidation mode, there is no two-hop file mounts
+            # and the file path here represents the real user data.
+            # We skip the cleanup for consolidation mode.
+            if (not data_utils.is_cloud_store_url(file_mount) and
+                    not managed_job_utils.is_consolidation_mode()):
                 path = os.path.expanduser(file_mount)
                 if os.path.isdir(path):
                     shutil.rmtree(path)
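The cleanup change above only deletes locally staged file mounts when the path is not a cloud-store URL and the controller is not running in consolidation mode, where the path is the user's real data. A standalone sketch of that guard follows; _is_cloud_store_url and the consolidation_mode flag are simplified placeholders for SkyPilot's data_utils.is_cloud_store_url and managed_job_utils.is_consolidation_mode:

    import os
    import shutil
    from typing import Dict


    def _is_cloud_store_url(path: str) -> bool:
        # Simplified stand-in: treat common object-store schemes as cloud URLs.
        return path.startswith(('s3://', 'gs://', 'r2://', 'https://'))


    def cleanup_staged_file_mounts(file_mounts: Dict[str, str],
                                   consolidation_mode: bool) -> None:
        for file_mount in file_mounts.values():
            # In consolidation mode there is no two-hop staging copy, so the
            # path points at real user data and must not be removed.
            if _is_cloud_store_url(file_mount) or consolidation_mode:
                continue
            path = os.path.expanduser(file_mount)
            if os.path.isdir(path):
                shutil.rmtree(path)
            elif os.path.isfile(path):
                os.remove(path)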
sky/jobs/scheduler.py
CHANGED
@@ -40,6 +40,7 @@ from argparse import ArgumentParser
 import contextlib
 from functools import lru_cache
 import os
+import sys
 import time
 import typing
 
@@ -89,12 +90,12 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = ('
+    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                           f'{dag_yaml_path} --job-id {job_id};')
 
     # If the command line here is changed, please also update
-    # utils._controller_process_alive. `--job-id X`
-    # the
+    # utils._controller_process_alive. The substring `--job-id X`
+    # should be in the command.
     run_cmd = (f'{activate_python_env_cmd}'
                f'{source_environment_cmd}'
                f'{run_controller_cmd}')
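The scheduler now builds the controller invocation from sys.executable -u -m sky.jobs.controller, so the child process runs under the same interpreter and environment as the scheduler, and the --job-id X substring stays in the command line for the liveness check mentioned in the comment. A minimal sketch of that pattern, using a plain subprocess call rather than the scheduler's shell pipeline:

    import subprocess
    import sys


    def start_controller(job_id: int, dag_yaml_path: str) -> subprocess.Popen:
        # Reuse the current interpreter so the controller sees the same Python
        # environment; keep '--job-id <id>' in the argv so a liveness check
        # can find the process by that substring.
        cmd = [
            sys.executable, '-u', '-m', 'sky.jobs.controller', dag_yaml_path,
            '--job-id', str(job_id)
        ]
        return subprocess.Popen(cmd, start_new_session=True)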
sky/jobs/server/core.py
CHANGED
@@ -1,5 +1,6 @@
 """SDK functions for managed jobs."""
 import os
+import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -20,6 +21,7 @@ from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
 from sky.skylet import constants as skylet_constants
@@ -43,6 +45,72 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
+def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Maybe upload files to the controller.
+
+    In consolidation mode, we don't need to upload files to the controller as
+    the API server and the controller are colocated.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    if managed_job_utils.is_consolidation_mode():
+        return local_to_controller_file_mounts
+
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
+        dag.name,
+        workspace=skypilot_config.get_active_workspace(
+            force_user_workspace=True),
+        entrypoint=common_utils.get_current_command())
+    for task_id, task in enumerate(dag.tasks):
+        resources_str = backend_utils.get_task_resources_str(
+            task, is_managed_job=True)
+        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                      task.name, resources_str)
+    return consolidation_mode_job_id
+
+
 @timeline.event
 @usage_lib.entrypoint
 def launch(
@@ -103,7 +171,7 @@ def launch(
                         'will be auto-generated) .')
             task_names.add(task_.name)
 
-            # Check for priority in resources
+            # Check for priority in resources
             task_priority = None
             if task_.resources:
                 # Convert set to list to access elements by index
@@ -121,20 +189,6 @@ def launch(
                             f'{resource.priority} but expected {task_priority}.'
                         )
 
-            # Check for conflict between resources priority and job
-            # priority
-            if task_.job_priority is not None:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Task {task_.name!r}: Cannot specify both '
-                        f'resources.priority ({task_priority}) and '
-                        f'job.priority ({task_.job_priority}). Please use only '
-                        'one priority specification method.')
-
-            # Fall back to job priority if no resources priority found
-            if task_priority is None:
-                task_priority = task_.job_priority
-
             if task_priority is not None:
                 if (priority is not None and priority != task_priority):
                     with ux_utils.print_exception_no_traceback():
@@ -183,34 +237,7 @@ def launch(
                     f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                     f'Reason: {common_utils.format_exception(e)}')
 
-
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
-
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
+    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
 
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
@@ -233,6 +260,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_id is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
     vars_to_fill = {
         'remote_original_user_yaml_path': remote_original_user_yaml_path,
         'original_user_dag_path': original_user_yaml_path.name,
@@ -244,9 +278,9 @@ def launch(
         'dag_name': dag.name,
         'remote_user_config_path': remote_user_config_path,
         'remote_env_file_path': remote_env_file_path,
-        'modified_catalogs':
-            service_catalog_common.get_modified_catalog_file_mounts(),
+        'modified_catalogs': modified_catalogs,
         'priority': priority,
+        'consolidation_mode_job_id': consolidation_mode_job_id,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
             remote_user_config_path=remote_user_config_path,
@@ -285,12 +319,44 @@ def launch(
             # workspace A, but the controller is in workspace B, the
             # intermediate bucket and newly created bucket should be in
            # workspace A.
-
-
-
-
-
-
+            if consolidation_mode_job_id is None:
+                return execution.launch(task=controller_task,
+                                        cluster_name=controller_name,
+                                        stream_logs=stream_logs,
+                                        retry_until_up=True,
+                                        fast=True,
+                                        _disable_controller_check=True)
+            # Manually launch the scheduler process in consolidation mode.
+            local_handle = backend_utils.is_controller_accessible(
+                controller=controller, stopped_message='')
+            backend = backend_utils.get_backend_from_handle(local_handle)
+            assert isinstance(backend, backends.CloudVmRayBackend)
+            backend.sync_file_mounts(
+                handle=local_handle,
+                all_file_mounts=controller_task.file_mounts,
+                storage_mounts=controller_task.storage_mounts)
+            run_script = controller_task.run
+            assert isinstance(run_script, str)
+            # Manually add the env variables to the run script. Originally
+            # this is done in ray jobs submission but now we have to do it
+            # manually because there is no ray runtime on the API server.
+            env_cmds = [
+                f'export {k}={v!r}'
+                for k, v in controller_task.envs.items()
+            ]
+            run_script = '\n'.join(env_cmds + [run_script])
+            # Dump script for high availability recovery.
+            if controller_utils.high_availability_specified(
+                    controller_name):
+                dump_script_path = (
+                    managed_job_utils.get_ha_dump_script_path(
+                        consolidation_mode_job_id))
+                dump_script_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(dump_script_path, 'w',
+                          encoding='utf-8') as script_f:
+                    script_f.write(run_script)
+            backend.run_on_head(local_handle, run_script)
+            return consolidation_mode_job_id, local_handle
 
 
 def queue_from_kubernetes_pod(
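In consolidation mode there is no Ray job submission to inject the controller task's environment variables, so the launch path above prepends export lines to the run script before handing it to run_on_head. A small standalone sketch of that string manipulation follows; it uses shlex.quote for shell quoting where the diff uses Python's !r formatting:

    import shlex
    from typing import Dict


    def inject_envs(run_script: str, envs: Dict[str, str]) -> str:
        # Prepend one 'export KEY=VALUE' line per variable so the script sees
        # them without relying on a job runtime to set them.
        env_cmds = [
            f'export {k}={shlex.quote(str(v))}' for k, v in envs.items()
        ]
        return '\n'.join(env_cmds + [run_script])


    script = inject_envs('echo "running job $JOB_ID"', {'JOB_ID': '42'})
    print(script)
    # export JOB_ID=42
    # echo "running job $JOB_ID"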
sky/jobs/state.py
CHANGED
@@ -463,6 +463,21 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 entrypoint))
 
 
+@_init_db
+def set_job_info_without_job_id(name: str, workspace: str,
+                                entrypoint: str) -> int:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            """\
+            INSERT INTO job_info
+            (name, schedule_state, workspace, entrypoint)
+            VALUES (?, ?, ?, ?)""",
+            (name, ManagedJobScheduleState.INACTIVE.value, workspace,
+             entrypoint))
+        return cursor.lastrowid
+
+
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""