skypilot-nightly 1.0.0.dev20250817__py3-none-any.whl → 1.0.0.dev20250819__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/catalog/data_fetchers/fetch_aws.py +2 -0
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
  5. sky/dashboard/out/_next/static/chunks/8969-23c8fbdb8b397d59.js +1 -0
  6. sky/dashboard/out/_next/static/chunks/{webpack-b6987eb47888da9c.js → webpack-008593a02784a2df.js} +1 -1
  7. sky/dashboard/out/_next/static/{s93sHgT13r_pnzP1An3gW → tYn7R2be3cQPYJfTxxE09}/_buildManifest.js +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/jobs/constants.py +1 -1
  24. sky/jobs/server/core.py +42 -33
  25. sky/jobs/server/utils.py +2 -1
  26. sky/jobs/utils.py +56 -9
  27. sky/provision/provisioner.py +10 -6
  28. sky/server/common.py +2 -4
  29. sky/server/requests/payloads.py +1 -0
  30. sky/server/requests/serializers/encoders.py +15 -3
  31. sky/server/server.py +4 -1
  32. sky/setup_files/MANIFEST.in +1 -0
  33. sky/skylet/ray_patches/__init__.py +17 -3
  34. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  35. sky/skylet/ray_patches/cli.py.diff +19 -0
  36. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  37. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  38. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  39. sky/skylet/ray_patches/updater.py.diff +18 -0
  40. sky/skylet/ray_patches/worker.py.diff +41 -0
  41. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/METADATA +1 -1
  42. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/RECORD +47 -40
  43. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +0 -1
  44. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +0 -1
  45. /sky/dashboard/out/_next/static/{s93sHgT13r_pnzP1An3gW → tYn7R2be3cQPYJfTxxE09}/_ssgManifest.js +0 -0
  46. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -514,7 +514,7 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
         job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
@@ -587,31 +587,36 @@ def queue(
     pool_match: Optional[str] = None,
     page: Optional[int] = None,
     limit: Optional[int] = None,
-) -> Tuple[List[Dict[str, Any]], int]:
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
     Please refer to sky.cli.job_queue for documentation.
 
     Returns:
-        [
-            {
-                'job_id': int,
-                'job_name': str,
-                'resources': str,
-                'submitted_at': (float) timestamp of submission,
-                'end_at': (float) timestamp of end,
-                'job_duration': (float) duration in seconds,
-                'recovery_count': (int) Number of retries,
-                'status': (sky.jobs.ManagedJobStatus) of the job,
-                'cluster_resources': (str) resources of the cluster,
-                'region': (str) region of the cluster,
-                'user_name': (Optional[str]) job creator's user name,
-                'user_hash': (str) job creator's user hash,
-                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
-                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
-            }
-        ]
+        jobs: List[Dict[str, Any]]
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'job_duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+                'user_name': (Optional[str]) job creator's user name,
+                'user_hash': (str) job creator's user hash,
+                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+            }
+        ]
+        total: int, total number of jobs after filter
+        status_counts: Dict[str, int], status counts after filter
+        total_no_filter: int, total number of jobs before filter
     Raises:
         sky.exceptions.ClusterNotUpError: the jobs controller is not up or
             does not exist.
@@ -645,13 +650,13 @@ def queue(
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
-            return [], 0
+            return [], 0, {}, 0
         user_hashes = [user.id for user in users]
 
     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
         skip_finished, accessible_workspaces, job_ids, workspace_match,
-        name_match, pool_match, page, limit, user_hashes)
+        name_match, pool_match, page, limit, user_hashes, statuses)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -664,11 +669,11 @@ def queue(
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
-        job_table_payload)
+    (jobs, total, result_type, total_no_filter, status_counts
+    ) = managed_job_utils.load_managed_job_queue(job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
-        return jobs, total
+        return jobs, total, status_counts, total_no_filter
 
     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
@@ -702,14 +707,18 @@ def queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return managed_job_utils.filter_jobs(jobs,
-                                         workspace_match,
-                                         name_match,
-                                         pool_match,
-                                         page=page,
-                                         limit=limit,
-                                         user_match=user_match,
-                                         enable_user_match=True)
+    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+        jobs,
+        workspace_match,
+        name_match,
+        pool_match,
+        page=page,
+        limit=limit,
+        user_match=user_match,
+        enable_user_match=True,
+        statuses=statuses,
+    )
+    return filtered_jobs, total, status_counts, total_no_filter
 
 
 @usage_lib.entrypoint
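
The user-visible effect of this change: sky.jobs.queue() now returns a 4-tuple instead of a 2-tuple. A minimal consumer sketch (not SkyPilot code; the literal job data is made up):

    # Hypothetical consumer of the new queue() return shape; the field
    # meanings mirror the docstring above.
    from typing import Any, Dict, List, Tuple

    QueueResult = Tuple[List[Dict[str, Any]], int, Dict[str, int], int]

    def summarize(result: QueueResult) -> str:
        jobs, total, status_counts, total_no_filter = result
        counts = ', '.join(f'{s}={n}' for s, n in sorted(status_counts.items()))
        return (f'{len(jobs)} of {total} filtered jobs shown; '
                f'{total_no_filter} before filtering; counts: {counts}')

    print(summarize(([{'job_id': 1}], 1, {'RUNNING': 1}, 5)))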
sky/jobs/server/utils.py CHANGED
@@ -62,7 +62,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
sky/jobs/utils.py CHANGED
@@ -768,6 +768,13 @@ def stream_logs_by_id(job_id: int,
                 assert tail > 0
                 # Read only the last 'tail' lines using deque
                 read_from = collections.deque(f, maxlen=tail)
+                # We set start_streaming to True here in case
+                # truncating the log file removes the line that
+                # contains LOG_FILE_START_STREAMING_AT. This does
+                # not cause issues for log files shorter than tail
+                # because tail_logs in sky/skylet/log_lib.py also
+                # handles LOG_FILE_START_STREAMING_AT.
+                start_streaming = True
                 for line in read_from:
                     if log_lib.LOG_FILE_START_STREAMING_AT in line:
                         start_streaming = True
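
As background on the technique in this hunk, collections.deque(iterable, maxlen=k) scans the whole iterable but retains only the last k items, which is why the code can cheaply keep the final tail lines. A standalone sketch:

    import collections
    import io

    log = io.StringIO('line1\nline2\nline3\nline4\n')
    tail = 2
    # deque with maxlen keeps only the most recent `tail` items while
    # scanning the file once, using O(tail) memory.
    read_from = collections.deque(log, maxlen=tail)
    assert list(read_from) == ['line3\n', 'line4\n']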
@@ -1133,6 +1140,7 @@ def dump_managed_job_queue(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
 ) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
@@ -1160,6 +1168,8 @@ def dump_managed_job_queue(
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    total_no_filter = len(jobs)
+
     if user_hashes:
         jobs = [
             job for job in jobs if job.get('user_hash', None) in user_hashes
@@ -1183,8 +1193,13 @@ def dump_managed_job_queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
-                              page, limit)
+    jobs, total, status_counts = filter_jobs(jobs,
+                                             workspace_match,
+                                             name_match,
+                                             pool_match,
+                                             page,
+                                             limit,
+                                             statuses=statuses)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1258,7 +1273,12 @@ def dump_managed_job_queue(
     else:
         job['details'] = None
 
-    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+    return message_utils.encode_payload({
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    })
 
 
 def filter_jobs(
@@ -1270,7 +1290,8 @@ def filter_jobs(
     limit: Optional[int],
     user_match: Optional[str] = None,
     enable_user_match: bool = False,
-) -> Tuple[List[Dict[str, Any]], int]:
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
     """Filter jobs based on the given criteria.
 
     Args:
@@ -1282,9 +1303,12 @@ def filter_jobs(
         limit: Limit to filter.
         user_match: User name to filter.
         enable_user_match: Whether to enable user match.
+        statuses: Statuses to filter.
 
     Returns:
-        List of filtered jobs and total number of jobs.
+        List of filtered jobs
+        Total number of jobs
+        Dictionary of status counts
     """
 
     # TODO(hailong): refactor the whole function including the
@@ -1314,6 +1338,7 @@ def filter_jobs(
         end = min(start + limit, len(result))
         return result[start:end]
 
+    status_counts: Dict[str, int] = collections.defaultdict(int)
     result = []
     checks = [
         ('workspace', workspace_match),
@@ -1327,25 +1352,34 @@ def filter_jobs(
         if not all(
                 _pattern_matches(job, key, pattern) for key, pattern in checks):
             continue
+        status_counts[job['status'].value] += 1
+        if statuses:
+            if job['status'].value not in statuses:
+                continue
         result.append(job)
 
     total = len(result)
 
-    return _handle_page_and_limit(result, page, limit), total
+    return _handle_page_and_limit(result, page, limit), total, status_counts
 
 
 def load_managed_job_queue(
     payload: str
-) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
+        str, int]]:
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
+    status_counts = {}
     if isinstance(result, dict):
        jobs = result['jobs']
        total = result['total']
+       status_counts = result.get('status_counts', {})
+       total_no_filter = result.get('total_no_filter', total)
    else:
        jobs = result
        total = len(jobs)
+       total_no_filter = total
        result_type = ManagedJobQueueResultType.LIST
 
     for job in jobs:
@@ -1355,7 +1389,7 @@ def load_managed_job_queue(
         # TODO(cooperc): Remove check before 0.12.0.
         user = global_user_state.get_user(job['user_hash'])
         job['user_name'] = user.name if user is not None else None
-    return jobs, total, result_type
+    return jobs, total, result_type, total_no_filter, status_counts
 
 
 def _get_job_status_from_tasks(
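
The wire format stays a single encoded payload, and the .get() defaults keep load_managed_job_queue compatible with controllers that predate the two new keys. A hedged sketch of the round-trip, with plain json standing in for SkyPilot's message_utils helpers:

    import json

    def dump(jobs, total, total_no_filter, status_counts):
        return json.dumps({'jobs': jobs, 'total': total,
                           'total_no_filter': total_no_filter,
                           'status_counts': status_counts})

    def load(payload):
        result = json.loads(payload)
        if isinstance(result, dict):
            total = result['total']
            # .get() defaults cover dict payloads from controllers that
            # predate the two new keys.
            return (result['jobs'], total,
                    result.get('total_no_filter', total),
                    result.get('status_counts', {}))
        # Oldest controllers sent a bare list.
        return result, len(result), len(result), {}

    print(load(dump([{'job_id': 1}], 1, 5, {'RUNNING': 1})))
    print(load(json.dumps([{'job_id': 2}])))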
@@ -1713,6 +1747,7 @@ class ManagedJobCodeGen:
         page: Optional[int] = None,
         limit: Optional[int] = None,
         user_hashes: Optional[List[Optional[str]]] = None,
+        statuses: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
         if managed_job_version < 9:
@@ -1720,7 +1755,7 @@ class ManagedJobCodeGen:
             # before #6652.
             # TODO(hailong): Remove compatibility before 0.12.0
             job_table = utils.dump_managed_job_queue()
-        else:
+        elif managed_job_version < 10:
             job_table = utils.dump_managed_job_queue(
                 skip_finished={skip_finished},
                 accessible_workspaces={accessible_workspaces!r},
@@ -1731,6 +1766,18 @@ class ManagedJobCodeGen:
                 page={page!r},
                 limit={limit!r},
                 user_hashes={user_hashes!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
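
The codegen now branches on managed_job_version so a newer client never passes keyword arguments that an older controller's dump_managed_job_queue would reject. A toy sketch of that gating pattern (the rendered calls are illustrative, not the exact generated code):

    # Sketch of the version-gating idea used by get_job_table above.
    def render_call(managed_job_version: int, statuses=None) -> str:
        if managed_job_version < 9:
            return 'utils.dump_managed_job_queue()'
        if managed_job_version < 10:
            return 'utils.dump_managed_job_queue(page=None, limit=None)'
        return ('utils.dump_managed_job_queue(page=None, limit=None, '
                f'statuses={statuses!r})')

    print(render_call(9))                # v9 controller: no statuses kwarg
    print(render_call(10, ['RUNNING']))  # v10+: statuses supported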
sky/provision/provisioner.py CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
         # This error is a user error instead of a provisioning failure.
         # And there is no possibility to fix it by teardown.
         raise
-    except Exception:  # pylint: disable=broad-except
+    except Exception as exc:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
             zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@ def bulk_provision(
                     provider_config=original_config['provider'])
                 break
             except NotImplementedError as e:
-                verb = 'terminate' if terminate else 'stop'
+                assert not terminate, (
+                    'Terminating must be supported by all clouds')
+                exc_msg = common_utils.format_exception(exc).replace(
+                    '\n', ' ')
                 # If the underlying cloud does not support stopping
                 # instances, we should stop failover as well.
                 raise provision_common.StopFailoverError(
-                    'During provisioner\'s failover, '
-                    f'{terminate_str.lower()} {cluster_name!r} failed. '
-                    f'We cannot {verb} the resources launched, as it is '
-                    f'not supported by {cloud}. Please try launching the '
+                    f'Provisioning cluster {cluster_name.display_name} '
+                    f'failed: {exc_msg}. Failover is stopped for safety '
+                    'because the cluster was previously in UP state but '
+                    f'{cloud} does not support stopping instances to '
+                    'preserve the cluster state. Please try launching the '
                     'cluster again, or terminate it with: '
                     f'sky down {cluster_name.display_name}') from e
             except Exception as e:  # pylint: disable=broad-except
sky/server/common.py CHANGED
@@ -561,15 +561,13 @@ def _start_api_server(deploy: bool = False,
     # For spawn mode, copy the environ to avoid polluting the SDK process.
     server_env = os.environ.copy()
     server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
-    _set_metrics_env_var(server_env, metrics, deploy)
     # Start the API server process in the background and don't wait for it.
     # If this is called from a CLI invocation, we need
     # start_new_session=True so that SIGINT on the CLI will not also kill
     # the API server.
-    server_env = os.environ.copy()
-    server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
     if enable_basic_auth:
         server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
+    _set_metrics_env_var(server_env, metrics, deploy)
     with open(log_path, 'w', encoding='utf-8') as log_file:
         # Because the log file is opened using a with statement, it may seem
         # that the file will be closed when the with statement is exited
@@ -643,7 +641,7 @@ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
         deploy: Whether the server is running in deploy mode, which means
             multiple processes might be running.
     """
-    if metrics:
+    if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
         env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
     if deploy:
         metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
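
With this change the env var acts as an independent on-switch: metrics stay enabled if either the metrics flag is set or the variable was already exported. A sketch of the resulting predicate (the env var name here is hypothetical; the real one lives in constants.ENV_VAR_SERVER_METRICS_ENABLED):

    import os

    # Hypothetical env var name for illustration only.
    METRICS_VAR = 'SKYPILOT_SERVER_METRICS_ENABLED'

    def metrics_enabled(metrics_flag: bool) -> bool:
        # The flag wins, but an env var exported by a previous run (or by
        # an operator) keeps metrics on across restarts too.
        return metrics_flag or os.getenv(METRICS_VAR) == 'true'

    os.environ[METRICS_VAR] = 'true'
    assert metrics_enabled(False)  # the env var alone is now enough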
sky/server/requests/payloads.py CHANGED
@@ -503,6 +503,7 @@ class JobsQueueBody(RequestBody):
     pool_match: Optional[str] = None
     page: Optional[int] = None
     limit: Optional[int] = None
+    statuses: Optional[List[str]] = None
 
 
 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/encoders.py CHANGED
@@ -113,8 +113,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
-        jobs, total = jobs_or_tuple
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +129,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {'jobs': jobs, 'total': total}
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }
 
 
 def _encode_serve_status(
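
The encoder now accepts three shapes: a bare jobs list, the legacy (jobs, total) tuple, and the new 4-tuple. A compact sketch of just the dispatch, with a stand-in enum for sky.jobs.ManagedJobStatus:

    import enum

    class Status(enum.Enum):  # stand-in for sky.jobs.ManagedJobStatus
        RUNNING = 'RUNNING'

    def shape_of(jobs_or_tuple):
        # Mirrors the dispatch above: bare list, legacy 2-tuple, new 4-tuple.
        if not isinstance(jobs_or_tuple, tuple):
            return 'list'
        if len(jobs_or_tuple) == 2:
            return 'legacy 2-tuple (jobs, total)'
        if len(jobs_or_tuple) == 4:
            return '4-tuple (jobs, total, status_counts, total_no_filter)'
        raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')

    print(shape_of([{'status': Status.RUNNING}]))
    print(shape_of(([], 0)))
    print(shape_of(([], 0, {}, 0)))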
sky/server/server.py CHANGED
@@ -1650,7 +1650,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
 
-    cluster_records = core.status(cluster_name, all_users=True)
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
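
context_utils.to_thread is SkyPilot's own wrapper; the standard-library equivalent of the pattern, offloading a blocking call so the event loop stays responsive, is asyncio.to_thread. A self-contained sketch with a stand-in for core.status:

    import asyncio
    import time

    def blocking_status(name: str):
        # Stand-in for core.status: a slow, synchronous call.
        time.sleep(0.1)
        return [{'name': name, 'status': 'UP'}]

    async def handler():
        # Offloading to a worker thread keeps the event loop free to serve
        # other websocket connections while the blocking call runs.
        records = await asyncio.to_thread(blocking_status, 'my-cluster')
        return records[0]

    print(asyncio.run(handler()))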
sky/setup_files/MANIFEST.in CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/skylet/ray_patches/__init__.py CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
         which patch >/dev/null 2>&1 || sudo yum install -y patch || true
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
-        # It is ok to patch again from the original file.
-        patch {orig_file} -i {patch_file} -o {target_file}
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
         """
     subprocess.run(script, shell=True, check=True)
 
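A Python-side sketch of the same fallback decision the generated shell script makes (function and argument names here are illustrative, not SkyPilot's): prefer the system patch binary, otherwise apply the .diff with the pure-Python patch package via python -m patch, as the script above does.

    import os
    import shutil
    import subprocess

    def apply_patch(orig_file: str, patch_file: str, target_file: str) -> None:
        """Prefer the system patch binary; else use the Python patch package."""
        if shutil.which('patch'):
            # Same call as the generated script: re-patching from the
            # pristine .orig copy keeps the operation idempotent.
            subprocess.run(
                ['patch', orig_file, '-i', patch_file, '-o', target_file],
                check=True)
        else:
            diff_file = patch_file.replace('.patch', '.diff')
            target_dir = os.path.dirname(target_file)
            subprocess.run(
                ['python', '-m', 'patch', '-d', target_dir, diff_file],
                check=True)
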
sky/skylet/ray_patches/autoscaler.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/autoscaler.py
++++ b/autoscaler.py
+@@ -1,3 +1,6 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
++# Sky patch changes:
++# - enable upscaling_speed to be 0.0
+ import copy
+ import logging
+ import math
+@@ -1071,7 +1074,7 @@
+         upscaling_speed = self.config.get("upscaling_speed")
+         aggressive = self.config.get("autoscaling_mode") == "aggressive"
+         target_utilization_fraction = self.config.get("target_utilization_fraction")
+-        if upscaling_speed:
++        if upscaling_speed is not None:  # NOTE(sky): enable 0.0
+             upscaling_speed = float(upscaling_speed)
+         # TODO(ameer): consider adding (if users ask) an option of
+         # initial_upscaling_num_workers.
sky/skylet/ray_patches/cli.py.diff ADDED
@@ -0,0 +1,19 @@
+--- a/cli.py
++++ b/cli.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
++# Otherwise, the output redirection ">" will not work.
++
+ import json
+ import os
+ import sys
+@@ -270,7 +274,7 @@
+         working_dir=working_dir,
+     )
+     job_id = client.submit_job(
+-        entrypoint=list2cmdline(entrypoint),
++        entrypoint=" ".join(entrypoint),
+         submission_id=submission_id,
+         runtime_env=final_runtime_env,
+         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff ADDED
@@ -0,0 +1,17 @@
+--- a/command_runner.py
++++ b/command_runner.py
+@@ -1,3 +1,5 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
++
+ import hashlib
+ import json
+ import logging
+@@ -137,7 +139,7 @@
+             {
+                 "ControlMaster": "auto",
+                 "ControlPath": "{}/%C".format(control_path),
+-                "ControlPersist": "10s",
++                "ControlPersist": "300s",
+             }
+         )
+         self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff ADDED
@@ -0,0 +1,20 @@
+--- a/log_monitor.py
++++ b/log_monitor.py
+@@ -1,3 +1,7 @@
++# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
++# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
++# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
++
+ import argparse
+ import errno
+ import glob
+@@ -374,7 +378,8 @@
+             next_line = next_line.decode("utf-8", "replace")
+             if next_line == "":
+                 break
+-            next_line = next_line.rstrip("\r\n")
++            if next_line.endswith("\n"):
++                next_line = next_line[:-1]
+ 
+             if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
+                 flush()  # Possible change of task/actor name.
sky/skylet/ray_patches/resource_demand_scheduler.py.diff ADDED
@@ -0,0 +1,32 @@
+--- a/resource_demand_scheduler.py
++++ b/resource_demand_scheduler.py
+@@ -1,3 +1,8 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
++# Sky patch changes:
++# - no new nodes are allowed to be launched launched when the upscaling_speed is 0
++# - comment out "assert not unfulfilled": this seems a buggy assert
++
+ """Implements multi-node-type autoscaling.
+ 
+ This file implements an autoscaling algorithm that is aware of multiple node
+@@ -448,7 +453,10 @@
+             + placement_group_nodes.get(node_type, 0),
+         )
+ 
+-        if upper_bound > 0:
++        # NOTE(sky): do not autoscale when upsclaing speed is 0.
++        if self.upscaling_speed == 0:
++            upper_bound = 0
++        if upper_bound >= 0:
+             updated_nodes_to_launch[node_type] = min(
+                 upper_bound, to_launch[node_type]
+             )
+@@ -592,7 +600,7 @@
+         unfulfilled, including_reserved = get_bin_pack_residual(
+             new_node_resources, unfulfilled, strict_spread=True
+         )
+-        assert not unfulfilled
++        # assert not unfulfilled  # NOTE(sky): buggy assert.
+         node_resources += including_reserved
+         return to_add, node_resources, node_type_counts
+ 
sky/skylet/ray_patches/updater.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/updater.py
++++ b/updater.py
+@@ -1,3 +1,7 @@
++# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
++# Sky patch changes:
++# - Ensure the node state is refreshed before checking the node is terminated.
++
+ import logging
+ import os
+ import subprocess
+@@ -325,6 +329,7 @@
+             )
+ 
+             time.sleep(READY_CHECK_INTERVAL)
++            self.provider.non_terminated_nodes({})
+ 
+     def do_update(self):
+         self.provider.set_node_tags(
sky/skylet/ray_patches/worker.py.diff ADDED
@@ -0,0 +1,41 @@
+--- a/worker.py
++++ b/worker.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
++# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
++
+ import atexit
+ import faulthandler
+ import functools
+@@ -2020,6 +2024,14 @@
+     pid = data.get("pid")
+     lines = data.get("lines", [])
+ 
++    def end_for(line: str) -> str:
++        if sys.platform == "win32":
++            return "\n"
++        if line.endswith("\r"):
++            return ""
++        return "\n"
++
++
+     if data.get("ip") == data.get("localhost"):
+         for line in lines:
+             if RAY_TQDM_MAGIC in line:
+@@ -2035,6 +2047,7 @@
+                     message_for(data, line),
+                 ),
+                 file=print_file,
++                end=end_for(line),
+             )
+     else:
+         for line in lines:
+@@ -2052,6 +2065,7 @@
+                     message_for(data, line),
+                 ),
+                 file=print_file,
++                end=end_for(line),
+             )
+     # Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
+     restore_tqdm()
{skypilot_nightly-1.0.0.dev20250817.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250817
+Version: 1.0.0.dev20250819
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0