skypilot-nightly 1.0.0.dev20241106__py3-none-any.whl → 1.0.0.dev20241108__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '1dcd2f009c1989cbc130e5b8490170a5a96c3e23'
+_SKYPILOT_COMMIT_SHA = '7bea46813b47a46ce7b00501413cac637ee400a3'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241106'
+__version__ = '1.0.0.dev20241108'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -3262,6 +3262,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     ) -> None:
         """Executes generated code on the head node."""
         style = colorama.Style
+        fore = colorama.Fore
 
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
         remote_log_dir = self.log_dir
@@ -3373,9 +3374,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         controller = controller_utils.Controllers.from_name(name)
         if controller == controller_utils.Controllers.JOBS_CONTROLLER:
             logger.info(
-                f'\n📋 Useful Commands'
-                f'\nManaged Job ID: '
+                f'\n{fore.CYAN}Managed Job ID: '
                 f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                f'\n📋 Useful Commands'
                 f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
                 f'{ux_utils.BOLD}sky jobs cancel {job_id}'
                 f'{ux_utils.RESET_BOLD}'
@@ -3392,8 +3393,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
                 f'{ux_utils.RESET_BOLD}')
         elif controller is None:
-            logger.info(f'\n📋 Useful Commands'
-                        f'\nJob ID: {job_id}'
+            logger.info(f'\n{fore.CYAN}Job ID: '
+                        f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                        f'\n📋 Useful Commands'
                         f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
                         f'{ux_utils.BOLD}sky cancel {name} {job_id}'
                         f'{ux_utils.RESET_BOLD}'
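
The two hunks above move the job ID line ahead of the "📋 Useful Commands" header and render it in cyan. A minimal sketch of the resulting formatting using plain colorama (the `ux_utils` symbols in the diff are SkyPilot-internal; the job id is hypothetical):

```python
import colorama

colorama.init()  # enables ANSI handling on Windows
fore = colorama.Fore
style = colorama.Style

job_id = 42  # hypothetical
print(f'{fore.CYAN}Managed Job ID: '
      f'{style.BRIGHT}{job_id}{style.RESET_ALL}\n'
      '📋 Useful Commands')
```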
sky/cli.py CHANGED
@@ -3036,9 +3036,9 @@ def show_gpus(
     and spot instances. There may be multiple regions with the same lowest
     price.
 
-    If ``--cloud kubernetes`` is specified, it will show the maximum quantities
-    of the GPU available on a single node and the real-time availability of
-    the GPU across all nodes in the Kubernetes cluster.
+    If ``--cloud kubernetes`` or ``--cloud k8s`` is specified, it will show the
+    maximum quantities of the GPU available on a single node and the real-time
+    availability of the GPU across all nodes in the Kubernetes cluster.
 
     Definitions of certain fields:
 
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py CHANGED
@@ -20,7 +20,6 @@ DEFAULT_LAMBDA_KEYS_PATH = os.path.expanduser('~/.lambda_cloud/lambda_keys')
 
 # List of all possible regions.
 REGIONS = [
-    'australia-southeast-1',
     'europe-central-1',
     'asia-south-1',
     'me-west-1',
@@ -28,9 +27,12 @@ REGIONS = [
     'asia-northeast-1',
     'asia-northeast-2',
     'us-east-1',
+    'us-east-2',
     'us-west-2',
     'us-west-1',
     'us-south-1',
+    'us-south-2',
+    'us-south-3',
     'us-west-3',
     'us-midwest-1',
 ]
sky/jobs/dashboard/dashboard.py CHANGED
@@ -26,7 +26,8 @@ def _is_running_on_jobs_controller() -> bool:
     """
     if pathlib.Path('~/.sky/sky_ray.yml').expanduser().exists():
         config = yaml.safe_load(
-            pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text())
+            pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text(
+                encoding='utf-8'))
         cluster_name = config.get('cluster_name', '')
         candidate_controller_names = (
             controller_utils.Controllers.JOBS_CONTROLLER.value.
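
The only change here is pinning the file encoding: without `encoding=`, `Path.read_text()` falls back to the locale's preferred encoding (e.g. cp1252 on some Windows setups), which can garble or reject non-ASCII YAML. A self-contained sketch of the fixed read:

```python
import pathlib

import yaml

path = pathlib.Path('~/.sky/sky_ray.yml').expanduser()
if path.exists():
    # Explicit UTF-8 makes the read deterministic across platforms.
    config = yaml.safe_load(path.read_text(encoding='utf-8'))
    cluster_name = config.get('cluster_name', '')
```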
sky/jobs/recovery_strategy.py CHANGED
@@ -36,6 +36,11 @@ DEFAULT_RECOVERY_STRATEGY = None
 # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
 MAX_JOB_CHECKING_RETRY = 10
 
+# Minutes to job cluster autodown. This should be significantly larger than
+# managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
+# cluster before its status can be updated by the job controller.
+_AUTODOWN_MINUTES = 5
+
 
 def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
     """Terminate the cluster."""
@@ -302,11 +307,17 @@ class StrategyExecutor:
             usage_lib.messages.usage.set_internal()
             # Detach setup, so that the setup failure can be detected
             # by the controller process (job_status -> FAILED_SETUP).
-            sky.launch(self.dag,
-                       cluster_name=self.cluster_name,
-                       detach_setup=True,
-                       detach_run=True,
-                       _is_launched_by_jobs_controller=True)
+            sky.launch(
+                self.dag,
+                cluster_name=self.cluster_name,
+                # We expect to tear down the cluster as soon as the job is
+                # finished. However, in case the controller dies, set
+                # autodown to try and avoid a resource leak.
+                idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                down=True,
+                detach_setup=True,
+                detach_run=True,
+                _is_launched_by_jobs_controller=True)
             logger.info('Managed job cluster launched.')
         except (exceptions.InvalidClusterNameError,
                 exceptions.NoCloudAccessError,
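
The new `idle_minutes_to_autostop`/`down` pair acts as a dead man's switch: even if the jobs controller dies, the job cluster tears itself down after five idle minutes instead of leaking. A minimal sketch of the same safety net from user code, assuming the public `sky` Python API (the task and cluster name are hypothetical):

```python
import sky

task = sky.Task(run='echo hello')  # stand-in for the managed job's DAG
sky.launch(
    task,
    cluster_name='managed-job-cluster',  # hypothetical
    idle_minutes_to_autostop=5,  # mirrors _AUTODOWN_MINUTES above
    down=True,  # terminate, not merely stop, once idle
)
```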
sky/jobs/state.py CHANGED
@@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import colorama
 
 from sky import sky_logging
+from sky.utils import common_utils
 from sky.utils import db_utils
 
 if typing.TYPE_CHECKING:
@@ -22,23 +23,6 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)
 
 
-def _get_db_path() -> str:
-    """Workaround to collapse multi-step Path ops for type checker.
-    Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
-    """
-    path = pathlib.Path('~/.sky/spot_jobs.db')
-    path = path.expanduser().absolute()
-    path.parents[0].mkdir(parents=True, exist_ok=True)
-    return str(path)
-
-
-_DB_PATH = _get_db_path()
-
-# Module-level connection/cursor; thread-safe as the module is only imported
-# once.
-_CONN = sqlite3.connect(_DB_PATH)
-_CURSOR = _CONN.cursor()
-
 # === Database schema ===
 # `spot` table contains all the finest-grained tasks, including all the
 # tasks of a managed job (called spot for legacy reason, as it is generalized
@@ -50,68 +34,99 @@ _CURSOR = _CONN.cursor()
 # identifier/primary key for all the tasks. We will use `spot_job_id`
 # to identify the spot job.
 # TODO(zhwu): schema migration may be needed.
-_CURSOR.execute("""\
-    CREATE TABLE IF NOT EXISTS spot (
-    job_id INTEGER PRIMARY KEY AUTOINCREMENT,
-    job_name TEXT,
-    resources TEXT,
-    submitted_at FLOAT,
-    status TEXT,
-    run_timestamp TEXT CANDIDATE KEY,
-    start_at FLOAT DEFAULT NULL,
-    end_at FLOAT DEFAULT NULL,
-    last_recovered_at FLOAT DEFAULT -1,
-    recovery_count INTEGER DEFAULT 0,
-    job_duration FLOAT DEFAULT 0,
-    failure_reason TEXT,
-    spot_job_id INTEGER,
-    task_id INTEGER DEFAULT 0,
-    task_name TEXT,
-    specs TEXT)""")
-_CONN.commit()
-
-db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
-# Create a new column `spot_job_id`, which is the same for tasks of the
-# same managed job.
-# The original `job_id` no longer has an actual meaning, but only a legacy
-# identifier for all tasks in database.
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'spot_job_id',
-                             'INTEGER',
-                             copy_from='job_id')
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'task_id',
-                             'INTEGER DEFAULT 0',
-                             value_to_replace_existing_entries=0)
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'task_name',
-                             'TEXT',
-                             copy_from='job_name')
-
-# Specs is some useful information about the task, e.g., the
-# max_restarts_on_errors value. It is stored in JSON format.
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'specs',
-                             'TEXT',
-                             value_to_replace_existing_entries=json.dumps({
-                                 'max_restarts_on_errors': 0,
-                             }))
-
-# `job_info` contains the mapping from job_id to the job_name.
-# In the future, it may contain more information about each job.
-_CURSOR.execute("""\
-    CREATE TABLE IF NOT EXISTS job_info (
-    spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
-    name TEXT)""")
-_CONN.commit()
+def create_table(cursor, conn):
+    # Enable WAL mode to avoid locking issues.
+    # See: issue #3863, #1441 and PR #1509
+    # https://github.com/microsoft/WSL/issues/2395
+    # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
+    # This may cause the database locked problem from WSL issue #1441.
+    if not common_utils.is_wsl():
+        try:
+            cursor.execute('PRAGMA journal_mode=WAL')
+        except sqlite3.OperationalError as e:
+            if 'database is locked' not in str(e):
+                raise
+            # If the database is locked, it is OK to continue, as the WAL mode
+            # is not critical and is likely to be enabled by other processes.
+
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS spot (
+        job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        job_name TEXT,
+        resources TEXT,
+        submitted_at FLOAT,
+        status TEXT,
+        run_timestamp TEXT CANDIDATE KEY,
+        start_at FLOAT DEFAULT NULL,
+        end_at FLOAT DEFAULT NULL,
+        last_recovered_at FLOAT DEFAULT -1,
+        recovery_count INTEGER DEFAULT 0,
+        job_duration FLOAT DEFAULT 0,
+        failure_reason TEXT,
+        spot_job_id INTEGER,
+        task_id INTEGER DEFAULT 0,
+        task_name TEXT,
+        specs TEXT)""")
+    conn.commit()
+
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
+    # Create a new column `spot_job_id`, which is the same for tasks of the
+    # same managed job.
+    # The original `job_id` no longer has an actual meaning, but only a legacy
+    # identifier for all tasks in database.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'spot_job_id',
+                                 'INTEGER',
+                                 copy_from='job_id')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_id',
+                                 'INTEGER DEFAULT 0',
+                                 value_to_replace_existing_entries=0)
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_name',
+                                 'TEXT',
+                                 copy_from='job_name')
+
+    # Specs is some useful information about the task, e.g., the
+    # max_restarts_on_errors value. It is stored in JSON format.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'specs',
+                                 'TEXT',
+                                 value_to_replace_existing_entries=json.dumps({
+                                     'max_restarts_on_errors': 0,
+                                 }))
+
+    # `job_info` contains the mapping from job_id to the job_name.
+    # In the future, it may contain more information about each job.
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS job_info (
+        spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        name TEXT)""")
+    conn.commit()
+
+
+# Module-level connection/cursor; thread-safe as the module is only imported
+# once.
+def _get_db_path() -> str:
+    """Workaround to collapse multi-step Path ops for type checker.
+    Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
+    """
+    path = pathlib.Path('~/.sky/spot_jobs.db')
+    path = path.expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+_DB_PATH = _get_db_path()
+db_utils.SQLiteConn(_DB_PATH, create_table)
 
 # job_duration is the time a job actually runs (including the
 # setup duration) before last_recover, excluding the provision
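
The schema setup now runs through `db_utils.SQLiteConn` instead of a bare module-level connection, so the WAL pragma and table creation are applied through a callback. A minimal sketch of what such a wrapper plausibly does (the real `sky.utils.db_utils.SQLiteConn` may differ in detail):

```python
import sqlite3
import threading
from typing import Callable


class SQLiteConn(threading.local):
    """Thread-local SQLite connection initialized by a callback.

    Because this subclasses threading.local, __init__ re-runs in every
    thread that touches the object, so each thread gets its own
    connection with the schema callback already applied.
    """

    def __init__(
        self, db_path: str,
        create_table: Callable[[sqlite3.Cursor, sqlite3.Connection], None]
    ) -> None:
        super().__init__()
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        create_table(self.cursor, self.conn)
```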
sky/jobs/utils.py CHANGED
@@ -14,7 +14,7 @@ import shutil
 import textwrap
 import time
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import colorama
 import filelock
@@ -487,6 +487,7 @@ def stream_logs(job_id: Optional[int],
         job_id = managed_job_state.get_latest_job_id()
         if job_id is None:
             return 'No managed job found.'
+
     if controller:
         if job_id is None:
             assert job_name is not None
@@ -494,16 +495,22 @@ def stream_logs(job_id: Optional[int],
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
-            managed_jobs = list(
-                filter(lambda job: job['job_name'] == job_name, managed_jobs))
-            if len(managed_jobs) == 0:
+            managed_job_ids: Set[int] = {
+                job['job_id']
+                for job in managed_jobs
+                if job['job_name'] == job_name
+            }
+            if len(managed_job_ids) == 0:
                 return f'No managed job found with name {job_name!r}.'
-            if len(managed_jobs) > 1:
-                job_ids_str = ', '.join(job['job_id'] for job in managed_jobs)
-                raise ValueError(
-                    f'Multiple managed jobs found with name {job_name!r} (Job '
-                    f'IDs: {job_ids_str}). Please specify the job_id instead.')
-            job_id = managed_jobs[0]['job_id']
+            if len(managed_job_ids) > 1:
+                job_ids_str = ', '.join(
+                    str(job_id) for job_id in managed_job_ids)
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Multiple managed jobs found with name {job_name!r} '
+                        f'(Job IDs: {job_ids_str}). Please specify the job_id '
+                        'instead.')
+            job_id = managed_job_ids.pop()
     assert job_id is not None, (job_id, job_name)
     # TODO: keep the following code sync with
     # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
@@ -849,6 +856,7 @@ class ManagedJobCodeGen:
 
        from sky.skylet import job_lib, log_lib
        from sky.skylet import constants
+        from sky.utils import ux_utils
        try:
            from sky.jobs.utils import stream_logs_by_id
        except ImportError:
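
The set comprehension above also changes behavior for multi-task jobs: each task of a managed job is stored as its own row sharing the same `job_id`, so the old list filter could plausibly report "multiple jobs" for what is really one job. A small sketch with hypothetical rows:

```python
from typing import Any, Dict, List, Set

# Hypothetical queue rows: a two-task managed job yields two rows
# with the same job_id.
managed_jobs: List[Dict[str, Any]] = [
    {'job_id': 7, 'job_name': 'train', 'task_id': 0},
    {'job_id': 7, 'job_name': 'train', 'task_id': 1},
    {'job_id': 9, 'job_name': 'eval', 'task_id': 0},
]

job_name = 'train'
managed_job_ids: Set[int] = {
    job['job_id'] for job in managed_jobs if job['job_name'] == job_name
}
assert managed_job_ids == {7}  # one job, despite two task rows
```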
sky/provision/instance_setup.py CHANGED
@@ -264,6 +264,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
        f'--disable-usage-stats '
        f'--port={constants.SKY_REMOTE_RAY_PORT} '
        f'--dashboard-port={constants.SKY_REMOTE_RAY_DASHBOARD_PORT} '
+        f'--min-worker-port 11002 '
        f'--object-manager-port=8076 '
        f'--temp-dir={constants.SKY_REMOTE_RAY_TEMPDIR}')
    if custom_resource:
sky/provision/lambda_cloud/lambda_utils.py CHANGED
@@ -82,7 +82,7 @@ def raise_lambda_error(response: requests.Response) -> None:
     if status_code == 200:
         return
     if status_code == 429:
-        # https://docs.lambdalabs.com/cloud/rate-limiting/
+        # https://docs.lambdalabs.com/public-cloud/cloud-api/
         raise LambdaCloudError('Your API requests are being rate limited.')
     try:
         resp_json = response.json()
@@ -145,7 +145,7 @@ class LambdaCloudClient:
         # Most API requests are rate limited at ~1 request every second but
         # launch requests are rate limited at ~1 request every 10 seconds.
         # So don't use launch requests to check availability.
-        # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
+        # See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
         available_regions = (self.list_catalog()[instance_type]
                              ['regions_with_capacity_available'])
         available_regions = [reg['name'] for reg in available_regions]
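
Only the documentation URLs change here, but the surrounding comments state the constraint: roughly one request per second in general, and one launch request per ten seconds. A generic client-side backoff sketch for the 429 case (illustrative only; the code above simply raises `LambdaCloudError`):

```python
import time

import requests


def get_with_backoff(url: str, headers: dict, max_attempts: int = 5):
    """Retry a GET with exponential backoff while rate limited."""
    delay = 1.0
    for _ in range(max_attempts):
        response = requests.get(url, headers=headers)
        if response.status_code != 429:  # not rate limited
            return response
        time.sleep(delay)
        delay *= 2  # back off harder each time
    raise RuntimeError(f'Still rate limited after {max_attempts} attempts')
```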
sky/provision/runpod/instance.py CHANGED
@@ -232,7 +232,12 @@ def query_ports(
     instances = _filter_instances(cluster_name_on_cloud,
                                   None,
                                   head_only=True)
-    assert len(instances) == 1
+    assert len(instances) <= 1
+    # It is possible that the instance is terminated on console by
+    # the user. In this case, the instance will not be found and we
+    # should return an empty dict.
+    if not instances:
+        return {}
     head_inst = list(instances.values())[0]
     ready_ports: Dict[int, List[common.Endpoint]] = {
         port: [common.SocketEndpoint(**endpoint)]
sky/skylet/job_lib.py CHANGED
@@ -181,14 +181,19 @@ class JobScheduler:
         subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
 
     def schedule_step(self, force_update_jobs: bool = False) -> None:
-        jobs = self._get_jobs()
-        if len(jobs) > 0 or force_update_jobs:
+        if force_update_jobs:
             update_status()
+        pending_jobs = self._get_pending_jobs()
         # TODO(zhwu, mraheja): One optimization can be allowing more than one
         # job staying in the pending state after ray job submit, so that to be
         # faster to schedule a large amount of jobs.
-        for job_id, run_cmd, submit, created_time in jobs:
+        for job_id, run_cmd, submit, created_time in pending_jobs:
             with filelock.FileLock(_get_lock_path(job_id)):
+                # We don't have to refresh the job status before checking, as
+                # the job status will only be stale in rare cases where ray job
+                # crashes; or the job stays in INIT state for a long time.
+                # In those cases, the periodic JobSchedulerEvent event will
+                # update the job status every 300 seconds.
                 status = get_status_no_lock(job_id)
                 if (status not in _PRE_RESOURCE_STATUSES or
                         created_time < psutil.boot_time()):
@@ -202,7 +207,7 @@ class JobScheduler:
                 self._run_job(job_id, run_cmd)
                 return
 
-    def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
         """Returns the metadata for jobs in the pending jobs table
 
         The information contains job_id, run command, submit time,
@@ -214,7 +219,7 @@ class JobScheduler:
 class FIFOScheduler(JobScheduler):
     """First in first out job scheduler"""
 
-    def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
         return list(
             _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
 
@@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int],
 
     This function should only be run on the remote instance with ray>=2.4.0.
     """
+    echo = logger.info if not silent else logger.debug
     if len(job_ids) == 0:
         return []
 
-    # TODO: if too slow, directly query against redis.
     ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
-
     job_client = _create_ray_job_submission_client()
 
-    # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
-    # which contains the job status (str) and submission_id (str).
-    ray_job_query_time = time.time()
-    job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
-
-    job_details = {}
-    ray_job_ids_set = set(ray_job_ids)
-    for job_detail in job_detail_lists:
-        if job_detail.submission_id in ray_job_ids_set:
-            job_details[job_detail.submission_id] = job_detail
-
     statuses = []
     for job_id, ray_job_id in zip(job_ids, ray_job_ids):
         # Per-job status lock is required because between the job status
@@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int],
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
             status = None
-            if ray_job_id in job_details:
-                ray_status = job_details[ray_job_id].status
-                status = _RAY_TO_JOB_STATUS_MAP[ray_status]
+            job_record = _get_jobs_by_ids([job_id])[0]
+            original_status = job_record['status']
+            job_submitted_at = job_record['submitted_at']
+
+            ray_job_query_time = time.time()
+            if original_status == JobStatus.INIT:
+                if (job_submitted_at >= psutil.boot_time() and job_submitted_at
+                        >= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
+                    # The job id is reserved, but the job is not submitted yet.
+                    # We should keep it in INIT.
+                    status = JobStatus.INIT
+                else:
+                    # We always immediately submit job after the job id is
+                    # allocated, i.e. INIT -> PENDING, if a job stays in INIT
+                    # for too long, it is likely the job submission process
+                    # was killed before the job is submitted. We should set it
+                    # to FAILED then. Note, if ray job indicates the job is
+                    # running, we will change status to PENDING below.
+                    echo(f'INIT job {job_id} is stale, setting to FAILED')
+                    status = JobStatus.FAILED
+
+            try:
+                # Querying status within the lock is safer than querying
+                # outside, as it avoids the race condition when job table is
+                # updated after the ray job status query.
+                # Also, getting per-job status is faster than querying all jobs,
+                # when there are significant number of finished jobs.
+                # Reference: getting 124 finished jobs takes 0.038s, while
+                # querying a single job takes 0.006s, 10 jobs takes 0.066s.
+                # TODO: if too slow, directly query against redis.
+                ray_job_status = job_client.get_job_status(ray_job_id)
+                status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value]
+            except RuntimeError:
+                # Job not found.
+                pass
+
             pending_job = _get_pending_job(job_id)
             if pending_job is not None:
                 if pending_job['created_time'] < psutil.boot_time():
-                    logger.info(f'Job {job_id} is stale, setting to FAILED: '
-                                f'created_time={pending_job["created_time"]}, '
-                                f'boot_time={psutil.boot_time()}')
+                    echo(f'Job {job_id} is stale, setting to FAILED: '
+                         f'created_time={pending_job["created_time"]}, '
+                         f'boot_time={psutil.boot_time()}')
                     # The job is stale as it is created before the instance
                     # is booted, e.g. the instance is rebooted.
                     status = JobStatus.FAILED
@@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int],
                 # as stale.
                 status = JobStatus.PENDING
 
-            original_status = get_status_no_lock(job_id)
             assert original_status is not None, (job_id, status)
             if status is None:
                 status = original_status
                 if (original_status is not None and
                         not original_status.is_terminal()):
-                    logger.info(f'Ray job status for job {job_id} is None, '
-                                'setting it to FAILED.')
+                    echo(f'Ray job status for job {job_id} is None, '
+                         'setting it to FAILED.')
                     # The job may be stale, when the instance is restarted
                     # (the ray redis is volatile). We need to reset the
                     # status of the task to FAILED if its original status
                     # is RUNNING or PENDING.
                     status = JobStatus.FAILED
                     _set_status_no_lock(job_id, status)
-                    if not silent:
-                        logger.info(f'Updated job {job_id} status to {status}')
+                    echo(f'Updated job {job_id} status to {status}')
             else:
                 # Taking max of the status is necessary because:
                 # 1. It avoids race condition, where the original status has
@@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int],
                 # DB) would already have that value. So we take the max here to
                 # keep it at later status.
                 status = max(status, original_status)
+            assert status is not None, (job_id, status, original_status)
             if status != original_status:  # Prevents redundant update.
                 _set_status_no_lock(job_id, status)
-                if not silent:
-                    logger.info(f'Updated job {job_id} status to {status}')
+                echo(f'Updated job {job_id} status to {status}')
         statuses.append(status)
     return statuses
 
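The rewrite above drops the bulk `list_jobs()` snapshot and instead calls `get_job_status()` per job while holding that job's lock; per the in-code reference, a single-job query takes ~0.006s versus 0.038s to list 124 finished jobs. A sketch of the Ray Job Submission API involved, assuming a local cluster on the default dashboard port (the submission id is hypothetical):

```python
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient('http://127.0.0.1:8265')
try:
    # Returns a JobStatus enum (PENDING/RUNNING/SUCCEEDED/...); its
    # .value string is what the diff maps through _RAY_TO_JOB_STATUS_MAP.
    status = client.get_job_status('sky-job-42')  # hypothetical id
    print(status.value)
except RuntimeError:
    # Raised when the submission id is unknown, e.g. after a reboot
    # wiped Ray's volatile job table.
    print('job not found')
```
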
sky/skylet/providers/scp/node_provider.py CHANGED
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
         metadata['tags'] = instance_info['tags']
         # TODO(ewzeng): The internal ip is hard to get, so set it to the
         # external ip as a hack. This should be changed in the future.
-        # https://docs.lambdalabs.com/cloud/learn-private-ip-address/
+        # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
         metadata['internal_ip'] = vm['ip']
         metadata['external_ip'] = vm['external_ip']
         return metadata
skypilot_nightly-1.0.0.dev20241108.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.dev20241106
+Version: 1.0.0.dev20241108
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
skypilot_nightly-1.0.0.dev20241108.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
-sky/__init__.py,sha256=gJi4nCnW9_tfOdSmOh1s0EemDMl3aeTk1lG8K9lrsHA,5882
+sky/__init__.py,sha256=3bl83kLcrLpOZrRBB4ZOcINvEGjYSmx_kP8Aypn8fbc,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
-sky/cli.py,sha256=6umPcFovU5sHIUdC0B9lfOstzWLA0DPS5x6dg1EOkeQ,211193
+sky/cli.py,sha256=STcQ0jaLicXahQOCruebuRrRa94KouQPF_P_EVP1CjI,211212
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
 sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
 sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
 sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
-sky/backends/cloud_vm_ray_backend.py,sha256=jlX1atSF4L31ZMzC_tnBaWnxvc2Wb8DRwt5G_ukrlJk,232799
+sky/backends/cloud_vm_ray_backend.py,sha256=yxsyqzA_jubsWhpFUmeTowxUPUj20M6jo9kkBI1Tbw4,232913
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
 sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -79,7 +79,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzE
 sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
 sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
 sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
-sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
+sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=yOPmmckiQ0HU6bKXWd7YdTrsF2sql3Bs_jYNpuxlo0I,4942
 sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
 sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
@@ -97,17 +97,17 @@ sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
 sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
 sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
 sky/jobs/core.py,sha256=w7PancHi8_-afLKZQ3HHMD1sEDoepm1vEMxyDlXdo64,17155
-sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
-sky/jobs/state.py,sha256=exN6BdJlLBzFTccJCSHN4dNjVeYFgTgqgxOaHwLw2IQ,24307
-sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
-sky/jobs/dashboard/dashboard.py,sha256=FXVQAWjAuQQTfAGlTCD-Xb9LckC5I4NhGwiBZy8Avo8,3186
+sky/jobs/recovery_strategy.py,sha256=O_DouAfWx8FNdQxXsr2msMwlKCIodS99cW6V4Lf1vMo,27219
+sky/jobs/state.py,sha256=DE02bCZc9bPbbuayb3Zml553mb4pEV7Z8t1pt8IGbYM,25252
+sky/jobs/utils.py,sha256=Ff3TttIEdVeM1_kOVkviqIDjeVfBPIXVE8i-yP1VDM8,37976
+sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
 sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
 sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
-sky/provision/instance_setup.py,sha256=c6i_NC6GrW4hXAQIU5_dUBbnThjZQNS3cL2M6yMtzes,23616
+sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
 sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
@@ -146,7 +146,7 @@ sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
 sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
 sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
-sky/provision/lambda_cloud/lambda_utils.py,sha256=H8uaaMEpLn5cqGCdhUH_oJiccv_cuMguUNAl0NqB0Ik,9873
+sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
 sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
 sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
 sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -154,7 +154,7 @@ sky/provision/paperspace/instance.py,sha256=q_V01DZSMXLfy63Zwt6AQotq02JuXQZb5CHS
 sky/provision/paperspace/utils.py,sha256=uOmxbDKjV6skFizC4gYXSxRuEqso5ck2kF7MbtNmhEs,9580
 sky/provision/runpod/__init__.py,sha256=6HYvHI27EaLrX1SS0vWVhdLu5HDBeZCdvAeDJuwM5pk,556
 sky/provision/runpod/config.py,sha256=9ulZJVL7nHuxhTdoj8D7lNn7SdicJ5zc6FIcHIG9tcg,321
-sky/provision/runpod/instance.py,sha256=ucmFQEzapbxylsl6K9EUo7bHTZYzvfECo6tpJc-MFrw,9577
+sky/provision/runpod/instance.py,sha256=AIWzTHuAu2dw8Rk-AHc7-14hUAYPEKh_UMzAhMzjDh0,9807
 sky/provision/runpod/utils.py,sha256=ZjrcpjKzwS2nXQ21dW405PLxBl_V9awcfRjucGB3alw,6795
 sky/provision/vsphere/__init__.py,sha256=5icB8-kfs926S9DVfNJSCBVr7z7cmCEDr04-YHX89_4,788
 sky/provision/vsphere/config.py,sha256=f_ojGmi_vbnwJ8Ri48cqhZHBOuIkj41j9bFbq-ldPOo,504
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
 sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
 sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
-sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
+sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
 sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
 sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
 sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -209,7 +209,7 @@ sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gM
 sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
 sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
 sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
-sky/skylet/providers/scp/node_provider.py,sha256=5HjFEGqKAqVcszEpcan_IzY9NKVMQdm2BUgGdfw9aUY,22411
+sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
 sky/skylet/ray_patches/__init__.py,sha256=IoCzj9zFaHW-n__bLD8dgC2pJMTfZRxRpr8rZGvMyrw,2761
 sky/skylet/ray_patches/autoscaler.py.patch,sha256=cZK15L29aay-qx6JoGVPNsPIo3UiG0bAHh8fqfFd-44,291
 sky/skylet/ray_patches/cli.py.patch,sha256=ooEAr3OfA6LN7v4iaNltY6w63TaOFssgw9iKWP49iJc,349
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20241106.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20241106.dist-info/METADATA,sha256=xDHkghCeZl-VGNYV5hps-0I-il3EKeUO9Rg7JcokqPI,19708
-skypilot_nightly-1.0.0.dev20241106.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-skypilot_nightly-1.0.0.dev20241106.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20241106.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20241106.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20241108.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241108.dist-info/METADATA,sha256=5DHqRTobJ2Irrs9uV-6ixI4dUDQFGvFRVnp3KCDT9pc,19708
+skypilot_nightly-1.0.0.dev20241108.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+skypilot_nightly-1.0.0.dev20241108.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241108.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241108.dist-info/RECORD,,