skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
         ]
 
+    @classmethod
+    def processing_statuses(cls) -> List['ManagedJobStatus']:
+        # Any status that is not terminal and is not CANCELLING.
+        return [
+            cls.PENDING,
+            cls.STARTING,
+            cls.RUNNING,
+            cls.RECOVERING,
+        ]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,
 
 
 @_init_db
-def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
+def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+                   callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
     assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
+    expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+    status_str = 'status=(?)'
+    if force_transit_to_recovering:
+        # For the HA job controller, it is possible that the jobs came from any
+        # processing status to recovering. But it should not be any terminal
+        # status as such jobs will not be recovered; and it should not be
+        # CANCELLING as we will directly trigger a cleanup.
+        expected_status = [
+            s.value for s in ManagedJobStatus.processing_statuses()
+        ]
+        question_mark_str = ', '.join(['?'] * len(expected_status))
+        status_str = f'status IN ({question_mark_str})'
+    # NOTE: if we are resuming from a controller failure and the previous status
+    # is STARTING, the initial value of `last_recovered_at` might not be set
+    # yet (default value -1). In this case, we should not add current timestamp.
+    # Otherwise, the job duration will be incorrect (~55 years from 1970).
+    current_time = time.time()
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
-            """\
+            f"""\
             UPDATE spot SET
-            status=(?), job_duration=job_duration+(?)-last_recovered_at
+            status=(?),
+            job_duration=CASE
+                WHEN last_recovered_at >= 0
+                THEN job_duration+(?)-last_recovered_at
+                ELSE job_duration
+            END,
+            last_recovered_at=CASE
+                WHEN last_recovered_at < 0
+                THEN (?)
+                ELSE last_recovered_at
+            END
             WHERE spot_job_id=(?) AND
             task_id=(?) AND
-            status=(?) AND
+            {status_str} AND
            end_at IS null""",
-            (ManagedJobStatus.RECOVERING.value, time.time(), job_id, task_id,
-             ManagedJobStatus.RUNNING.value))
+            (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+             job_id, task_id, *expected_status))
         if cursor.rowcount != 1:
             raise exceptions.ManagedJobStatusError(
                 f'Failed to set the task to recovering. '
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_job_status_with_task_id(job_id: int,
+                                task_id: int) -> Optional[ManagedJobStatus]:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        status = cursor.execute(
+            """\
+            SELECT status FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+        return ManagedJobStatus(status[0]) if status else None
+
+
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))
 
@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 @_init_db
 def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) -> None:
-    """Do not call without holding the scheduler lock."""
+                          user_hash: str, priority: int) -> bool:
+    """Do not call without holding the scheduler lock.
+
+    Returns: Whether this is a recovery run or not.
+        If this is a recovery run, the job may already be in the WAITING
+        state and the update will not change the schedule_state (hence the
+        updated_count will be 0). In this case, we return True.
+        Otherwise, we return False.
+    """
     assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
              original_user_yaml_path, env_file_path, user_hash, priority,
              job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
-        assert updated_count == 1, (job_id, updated_count)
+        # For a recovery run, the job may already be in the WAITING state.
+        assert updated_count <= 1, (job_id, updated_count)
+        return updated_count == 0
 
 
 @_init_db
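
Note on the set_recovering change: the CASE expressions guard against a recovery run in which last_recovered_at still holds its default of -1 (e.g. the controller failed while the task was STARTING). Below is a minimal, standalone sketch, not SkyPilot code, that replays the same UPDATE against a throwaway in-memory SQLite table; the table contents and values are hypothetical and only illustrate the guard.

import sqlite3
import time

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE spot (spot_job_id INT, task_id INT, status TEXT, '
             'job_duration REAL, last_recovered_at REAL, end_at REAL)')
# A job resumed while still STARTING: last_recovered_at keeps its default -1.
conn.execute("INSERT INTO spot VALUES (1, 0, 'STARTING', 0, -1, NULL)")

now = time.time()
conn.execute(
    """UPDATE spot SET
       status=(?),
       job_duration=CASE WHEN last_recovered_at >= 0
                         THEN job_duration+(?)-last_recovered_at
                         ELSE job_duration END,
       last_recovered_at=CASE WHEN last_recovered_at < 0
                              THEN (?) ELSE last_recovered_at END
       WHERE spot_job_id=(?) AND task_id=(?) AND end_at IS NULL""",
    ('RECOVERING', now, now, 1, 0))

duration, recovered_at = conn.execute(
    'SELECT job_duration, last_recovered_at FROM spot').fetchone()
print(duration, recovered_at)  # -> 0.0 and the current timestamp

Without the guard, job_duration for such a row would be inflated by roughly the full interval since the Unix epoch, i.e. the "~55 years" mentioned in the diff comment.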
sky/jobs/utils.py CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disabled it when recovering because we want to avoid caveats of infinite
+    # restart of last controller process that fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import scp
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
sky/provision/scp/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""SCP provisioner for SkyPilot."""
+
+from sky.provision.scp.config import bootstrap_instances
+from sky.provision.scp.instance import cleanup_ports
+from sky.provision.scp.instance import get_cluster_info
+from sky.provision.scp.instance import open_ports
+from sky.provision.scp.instance import query_instances
+from sky.provision.scp.instance import run_instances
+from sky.provision.scp.instance import stop_instances
+from sky.provision.scp.instance import terminate_instances
+from sky.provision.scp.instance import wait_instances
+
+__all__ = ('bootstrap_instances', 'cleanup_ports', 'get_cluster_info',
+           'open_ports', 'query_instances', 'run_instances', 'stop_instances',
+           'terminate_instances', 'wait_instances')
sky/provision/scp/config.py ADDED
@@ -0,0 +1,93 @@
+"""SCP configuration bootstrapping."""
+
+import subprocess
+
+from sky.clouds.utils import scp_utils
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del cluster_name
+
+    node_cfg = config.node_config
+    zone_id = _get_zone_id(region)
+    node_cfg['zone_id'] = zone_id
+
+    docker_cfg = config.docker_config
+    docker_cfg['imageId'] = node_cfg['imageId']
+    docker_cfg['serviceZoneId'] = zone_id
+    docker_cfg['serverType'] = node_cfg['InstanceType']
+    docker_cfg['contractId'] = 'None'
+    ssh_public_key = node_cfg['AuthorizedKey']
+    docker_cfg['initialScript'] = _get_init_script(ssh_public_key)
+
+    key_pair_id = _get_key_pair_id()
+    miscellaneous = {
+        'deletionProtectionEnabled': False,
+        'keyPairId': key_pair_id,
+        'blockStorage': {
+            'blockStorageName': 'skystorage',
+            'diskSize': node_cfg['diskSize'],
+            'encryptEnabled': False,
+            'productId': 'PRODUCT-sRlJ34iBr9hOxN9J5PrQxo'
+        },
+        'nic': {
+            'natEnabled': True
+        },
+    }
+
+    docker_cfg.update(miscellaneous)
+
+    return config
+
+
+def _get_zone_id(region_name: str):
+    zone_contents = scp_utils.SCPClient().get_zones()
+    zone_dict = {
+        item['serviceZoneName']: item['serviceZoneId'] for item in zone_contents
+    }
+    return zone_dict[region_name]
+
+
+def _get_init_script(ssh_public_key: str):
+    init_script_content = _get_default_config_cmd() + _get_ssh_key_gen_cmd(
+        ssh_public_key)
+    init_script_content_string = f'"{init_script_content}"'
+    command = f'echo {init_script_content_string} | base64'
+    result = subprocess.run(command,
+                            shell=True,
+                            capture_output=True,
+                            text=True,
+                            check=True)
+    init_script_content_base64 = result.stdout
+    return {
+        'encodingType': 'base64',
+        'initialScriptShell': 'bash',
+        'initialScriptType': 'text',
+        'initialScriptContent': init_script_content_base64
+    }
+
+
+def _get_default_config_cmd():
+    cmd_list = ['apt-get update', 'apt-get -y install python3-pip']
+    res = ''
+    for cmd in cmd_list:
+        res += cmd + '; '
+    return res
+
+
+def _get_ssh_key_gen_cmd(ssh_public_key: str):
+    cmd_st = 'mkdir -p ~/.ssh/; touch ~/.ssh/authorized_keys;'
+    cmd_ed = 'chmod 644 ~/.ssh/authorized_keys; chmod 700 ~/.ssh/'
+    cmd = "echo '{}' &>>~/.ssh/authorized_keys;".format(ssh_public_key)  # pylint: disable=invalid-string-quote
+    return cmd_st + cmd + cmd_ed
+
+
+def _get_key_pair_id():
+    key_pairs = scp_utils.SCPClient().get_key_pairs()
+    if key_pairs['totalCount'] == 0:
+        raise RuntimeError('create key pair')
+    return key_pairs['contents'][0]['keyPairId']
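
Note on _get_init_script: the new config.py base64-encodes the startup script by piping it through the shell (echo ... | base64). For comparison only, and not what the file ships, the same payload can be produced in-process with Python's standard base64 module; the sketch below ignores base64's default 76-column line wrapping and assumes the trailing newline that echo adds, and the public-key value is a placeholder.

import base64


def encode_init_script(script: str) -> str:
    # Roughly equivalent to `echo "<script>" | base64`: echo appends a
    # trailing newline, so add it before encoding.
    return base64.b64encode((script + '\n').encode()).decode()


print(encode_init_script(
    'apt-get update; apt-get -y install python3-pip; '
    "mkdir -p ~/.ssh/; echo '<pubkey>' >> ~/.ssh/authorized_keys;"))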