skypilot-nightly 1.0.0.dev20250615__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
         ]
 
+    @classmethod
+    def processing_statuses(cls) -> List['ManagedJobStatus']:
+        # Any status that is not terminal and is not CANCELLING.
+        return [
+            cls.PENDING,
+            cls.STARTING,
+            cls.RUNNING,
+            cls.RECOVERING,
+        ]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
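
As a side note on this hunk: grouping statuses behind a classmethod keeps the "which states count as processing" decision in one place. A minimal, self-contained sketch of the same pattern, using a stand-in enum rather than SkyPilot's actual class:

    import enum
    from typing import List


    class JobStatus(enum.Enum):
        """Stand-in for ManagedJobStatus; values are illustrative."""
        PENDING = 'PENDING'
        STARTING = 'STARTING'
        RUNNING = 'RUNNING'
        RECOVERING = 'RECOVERING'
        CANCELLING = 'CANCELLING'
        SUCCEEDED = 'SUCCEEDED'

        @classmethod
        def processing_statuses(cls) -> List['JobStatus']:
            # Everything that is neither terminal nor CANCELLING.
            return [cls.PENDING, cls.STARTING, cls.RUNNING, cls.RECOVERING]


    assert JobStatus.RUNNING in JobStatus.processing_statuses()
    assert JobStatus.SUCCEEDED not in JobStatus.processing_statuses()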
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,
 
 
 @_init_db
-def set_recovering(job_id: int, task_id: int,
+def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+                   callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
     assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
+    expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+    status_str = 'status=(?)'
+    if force_transit_to_recovering:
+        # For the HA job controller, it is possible that the jobs came from any
+        # processing status to recovering. But it should not be any terminal
+        # status as such jobs will not be recovered; and it should not be
+        # CANCELLING as we will directly trigger a cleanup.
+        expected_status = [
+            s.value for s in ManagedJobStatus.processing_statuses()
+        ]
+        question_mark_str = ', '.join(['?'] * len(expected_status))
+        status_str = f'status IN ({question_mark_str})'
+    # NOTE: if we are resuming from a controller failure and the previous status
+    # is STARTING, the initial value of `last_recovered_at` might not be set
+    # yet (default value -1). In this case, we should not add current timestamp.
+    # Otherwise, the job duration will be incorrect (~55 years from 1970).
+    current_time = time.time()
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
-            """\
+            f"""\
             UPDATE spot SET
-            status=(?),
+            status=(?),
+            job_duration=CASE
+                WHEN last_recovered_at >= 0
+                THEN job_duration+(?)-last_recovered_at
+                ELSE job_duration
+            END,
+            last_recovered_at=CASE
+                WHEN last_recovered_at < 0
+                THEN (?)
+                ELSE last_recovered_at
+            END
             WHERE spot_job_id=(?) AND
             task_id=(?) AND
-            status=(?) AND
+            {status_str} AND
             end_at IS null""",
-            (ManagedJobStatus.RECOVERING.value,
-
+            (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+             job_id, task_id, *expected_status))
         if cursor.rowcount != 1:
             raise exceptions.ManagedJobStatusError(
                 f'Failed to set the task to recovering. '
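
The CASE guards are the subtle part of this hunk. An illustrative in-memory sqlite3 run of the same UPDATE logic (schema cut down to three columns; not SkyPilot code) shows that a sentinel last_recovered_at of -1 leaves job_duration untouched and seeds the timestamp, instead of computing a duration measured from 1970:

    import sqlite3
    import time

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE spot '
                 '(status TEXT, job_duration REAL, last_recovered_at REAL)')
    # A job that crashed while STARTING: last_recovered_at was never set.
    conn.execute("INSERT INTO spot VALUES ('STARTING', 0, -1)")
    now = time.time()
    conn.execute(
        """UPDATE spot SET
           status=(?),
           job_duration=CASE WHEN last_recovered_at >= 0
                             THEN job_duration+(?)-last_recovered_at
                             ELSE job_duration END,
           last_recovered_at=CASE WHEN last_recovered_at < 0
                                  THEN (?) ELSE last_recovered_at END""",
        ('RECOVERING', now, now))
    # Prints (0.0, <now>): duration preserved, timestamp initialized.
    print(conn.execute(
        'SELECT job_duration, last_recovered_at FROM spot').fetchone())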
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_job_status_with_task_id(job_id: int,
+                                task_id: int) -> Optional[ManagedJobStatus]:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        status = cursor.execute(
+            """\
+            SELECT status FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+    return ManagedJobStatus(status[0]) if status else None
+
+
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))
 
@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 @_init_db
 def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) -> None:
-    """Do not call without holding the scheduler lock."""
+                          user_hash: str, priority: int) -> bool:
+    """Do not call without holding the scheduler lock.
+
+    Returns: Whether this is a recovery run or not.
+        If this is a recovery run, the job may already be in the WAITING
+        state and the update will not change the schedule_state (hence the
+        updated_count will be 0). In this case, we return True.
+        Otherwise, we return False.
+    """
     assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
              original_user_yaml_path, env_file_path, user_hash, priority,
              job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
-    assert updated_count == 1, (job_id, updated_count)
+    # For a recovery run, the job may already be in the WAITING state.
+    assert updated_count <= 1, (job_id, updated_count)
+    return updated_count == 0
 
 
 @_init_db
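
The return-value change in scheduler_set_waiting hinges on SQL rowcount semantics. A reduced sqlite3 illustration (the real query also records the yaml paths, env file, user hash, and priority) of why rowcount 0 signals a recovery run:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE spot (spot_job_id INT, schedule_state TEXT)')
    conn.execute("INSERT INTO spot VALUES (1, 'INACTIVE')")

    def set_waiting(job_id: int) -> bool:
        """Returns True if this looks like a recovery run."""
        cur = conn.execute(
            "UPDATE spot SET schedule_state='WAITING' "
            "WHERE spot_job_id=(?) AND schedule_state='INACTIVE'", (job_id,))
        assert cur.rowcount <= 1, (job_id, cur.rowcount)
        return cur.rowcount == 0

    print(set_waiting(1))  # False: fresh run, INACTIVE -> WAITING.
    print(set_waiting(1))  # True: already WAITING, so nothing matched.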
sky/jobs/utils.py
CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disabled it when recovering because we want to avoid caveats of infinite
+    # restart of last controller process that fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
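
The early return is a plain signal-file check. A self-contained sketch of the pattern (the path below is a stand-in; the real location comes from constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE in sky/skylet/constants.py, and the file is managed by the controller's start-up script):

    import os
    import tempfile

    SIGNAL_FILE = os.path.join(tempfile.gettempdir(), 'restarting.signal')

    def update_statuses() -> str:
        if os.path.exists(SIGNAL_FILE):
            # Controller is restarting; do not mark jobs FAILED_CONTROLLER.
            return 'skipped'
        return 'refreshed'

    open(SIGNAL_FILE, 'w').close()  # the start script touches the signal file
    print(update_statuses())        # -> skipped
    os.remove(SIGNAL_FILE)          # removed once recovery completes
    print(update_statuses())        # -> refreshed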
sky/provision/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import scp
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
sky/provision/scp/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""SCP provisioner for SkyPilot."""
+
+from sky.provision.scp.config import bootstrap_instances
+from sky.provision.scp.instance import cleanup_ports
+from sky.provision.scp.instance import get_cluster_info
+from sky.provision.scp.instance import open_ports
+from sky.provision.scp.instance import query_instances
+from sky.provision.scp.instance import run_instances
+from sky.provision.scp.instance import stop_instances
+from sky.provision.scp.instance import terminate_instances
+from sky.provision.scp.instance import wait_instances
+
+__all__ = ('bootstrap_instances', 'cleanup_ports', 'get_cluster_info',
+           'open_ports', 'query_instances', 'run_instances', 'stop_instances',
+           'terminate_instances', 'wait_instances')
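
Each provider package exports this same set of names, which is what lets the provision layer route calls by cloud. A hedged sketch of that style of dispatch (SkyPilot's actual router in sky/provision/__init__.py may differ in its details):

    import importlib
    from typing import Any

    def route(cloud: str, func_name: str, *args: Any, **kwargs: Any) -> Any:
        """Dispatch e.g. route('scp', 'run_instances', ...) by module name."""
        module = importlib.import_module(f'sky.provision.{cloud}')
        impl = getattr(module, func_name)
        return impl(*args, **kwargs)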
sky/provision/scp/config.py
ADDED
@@ -0,0 +1,93 @@
+"""SCP configuration bootstrapping."""
+
+import subprocess
+
+from sky.clouds.utils import scp_utils
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del cluster_name
+
+    node_cfg = config.node_config
+    zone_id = _get_zone_id(region)
+    node_cfg['zone_id'] = zone_id
+
+    docker_cfg = config.docker_config
+    docker_cfg['imageId'] = node_cfg['imageId']
+    docker_cfg['serviceZoneId'] = zone_id
+    docker_cfg['serverType'] = node_cfg['InstanceType']
+    docker_cfg['contractId'] = 'None'
+    ssh_public_key = node_cfg['AuthorizedKey']
+    docker_cfg['initialScript'] = _get_init_script(ssh_public_key)
+
+    key_pair_id = _get_key_pair_id()
+    miscellaneous = {
+        'deletionProtectionEnabled': False,
+        'keyPairId': key_pair_id,
+        'blockStorage': {
+            'blockStorageName': 'skystorage',
+            'diskSize': node_cfg['diskSize'],
+            'encryptEnabled': False,
+            'productId': 'PRODUCT-sRlJ34iBr9hOxN9J5PrQxo'
+        },
+        'nic': {
+            'natEnabled': True
+        },
+    }
+
+    docker_cfg.update(miscellaneous)
+
+    return config
+
+
+def _get_zone_id(region_name: str):
+    zone_contents = scp_utils.SCPClient().get_zones()
+    zone_dict = {
+        item['serviceZoneName']: item['serviceZoneId'] for item in zone_contents
+    }
+    return zone_dict[region_name]
+
+
+def _get_init_script(ssh_public_key: str):
+    init_script_content = _get_default_config_cmd() + _get_ssh_key_gen_cmd(
+        ssh_public_key)
+    init_script_content_string = f'"{init_script_content}"'
+    command = f'echo {init_script_content_string} | base64'
+    result = subprocess.run(command,
+                            shell=True,
+                            capture_output=True,
+                            text=True,
+                            check=True)
+    init_script_content_base64 = result.stdout
+    return {
+        'encodingType': 'base64',
+        'initialScriptShell': 'bash',
+        'initialScriptType': 'text',
+        'initialScriptContent': init_script_content_base64
+    }
+
+
+def _get_default_config_cmd():
+    cmd_list = ['apt-get update', 'apt-get -y install python3-pip']
+    res = ''
+    for cmd in cmd_list:
+        res += cmd + '; '
+    return res
+
+
+def _get_ssh_key_gen_cmd(ssh_public_key: str):
+    cmd_st = 'mkdir -p ~/.ssh/; touch ~/.ssh/authorized_keys;'
+    cmd_ed = 'chmod 644 ~/.ssh/authorized_keys; chmod 700 ~/.ssh/'
+    cmd = "echo '{}' &>>~/.ssh/authorized_keys;".format(ssh_public_key)  # pylint: disable=invalid-string-quote
+    return cmd_st + cmd + cmd_ed
+
+
+def _get_key_pair_id():
+    key_pairs = scp_utils.SCPClient().get_key_pairs()
+    if key_pairs['totalCount'] == 0:
+        raise RuntimeError('create key pair')
+    return key_pairs['contents'][0]['keyPairId']