skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (113)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +200 -78
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +104 -53
  7. sky/client/sdk.py +13 -5
  8. sky/client/sdk_async.py +4 -2
  9. sky/clouds/kubernetes.py +2 -1
  10. sky/clouds/runpod.py +20 -7
  11. sky/core.py +7 -53
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/mounting_utils.py +19 -10
  36. sky/execution.py +4 -2
  37. sky/global_user_state.py +224 -38
  38. sky/jobs/client/sdk.py +10 -1
  39. sky/jobs/controller.py +7 -7
  40. sky/jobs/server/core.py +3 -3
  41. sky/jobs/server/server.py +15 -11
  42. sky/jobs/utils.py +1 -1
  43. sky/logs/agent.py +30 -3
  44. sky/logs/aws.py +9 -19
  45. sky/provision/__init__.py +2 -1
  46. sky/provision/aws/instance.py +2 -1
  47. sky/provision/azure/instance.py +2 -1
  48. sky/provision/cudo/instance.py +2 -2
  49. sky/provision/do/instance.py +2 -2
  50. sky/provision/docker_utils.py +41 -19
  51. sky/provision/fluidstack/instance.py +2 -2
  52. sky/provision/gcp/instance.py +2 -1
  53. sky/provision/hyperbolic/instance.py +2 -1
  54. sky/provision/instance_setup.py +1 -1
  55. sky/provision/kubernetes/instance.py +134 -8
  56. sky/provision/lambda_cloud/instance.py +2 -1
  57. sky/provision/nebius/instance.py +2 -1
  58. sky/provision/oci/instance.py +2 -1
  59. sky/provision/paperspace/instance.py +2 -2
  60. sky/provision/primeintellect/instance.py +2 -2
  61. sky/provision/provisioner.py +1 -0
  62. sky/provision/runpod/instance.py +2 -2
  63. sky/provision/scp/instance.py +2 -2
  64. sky/provision/seeweb/instance.py +2 -1
  65. sky/provision/vast/instance.py +2 -1
  66. sky/provision/vsphere/instance.py +6 -5
  67. sky/schemas/api/responses.py +2 -1
  68. sky/serve/autoscalers.py +2 -0
  69. sky/serve/client/impl.py +45 -19
  70. sky/serve/replica_managers.py +12 -5
  71. sky/serve/serve_utils.py +5 -11
  72. sky/serve/server/core.py +9 -6
  73. sky/serve/server/impl.py +78 -25
  74. sky/serve/server/server.py +4 -5
  75. sky/serve/service_spec.py +33 -0
  76. sky/server/auth/oauth2_proxy.py +2 -2
  77. sky/server/constants.py +1 -1
  78. sky/server/daemons.py +2 -3
  79. sky/server/requests/executor.py +56 -6
  80. sky/server/requests/payloads.py +31 -8
  81. sky/server/requests/preconditions.py +2 -3
  82. sky/server/rest.py +2 -0
  83. sky/server/server.py +28 -19
  84. sky/server/stream_utils.py +34 -12
  85. sky/setup_files/dependencies.py +12 -2
  86. sky/setup_files/setup.py +44 -44
  87. sky/skylet/constants.py +2 -3
  88. sky/templates/kubernetes-ray.yml.j2 +16 -15
  89. sky/usage/usage_lib.py +3 -0
  90. sky/utils/cli_utils/status_utils.py +4 -5
  91. sky/utils/context.py +104 -29
  92. sky/utils/controller_utils.py +7 -6
  93. sky/utils/kubernetes/create_cluster.sh +13 -28
  94. sky/utils/kubernetes/delete_cluster.sh +10 -7
  95. sky/utils/kubernetes/generate_kind_config.py +6 -66
  96. sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
  97. sky/utils/kubernetes_enums.py +5 -0
  98. sky/utils/ux_utils.py +35 -1
  99. sky/utils/yaml_utils.py +9 -0
  100. sky/volumes/client/sdk.py +44 -8
  101. sky/volumes/server/server.py +33 -7
  102. sky/volumes/volume.py +22 -14
  103. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
  104. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
  105. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  109. sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
  110. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
  111. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
  112. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
  113. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/server/requests/payloads.py CHANGED
@@ -316,6 +316,9 @@ class StatusBody(RequestBody):
316
316
  all_users: bool = True
317
317
  # TODO (kyuds): default to False post 0.10.5
318
318
  include_credentials: bool = True
319
+ # Only return fields that are needed for the
320
+ # dashboard / CLI summary response
321
+ summary_response: bool = False
319
322
 
320
323
 
321
324
  class StartBody(RequestBody):
@@ -475,6 +478,17 @@ class VolumeListBody(RequestBody):
475
478
  pass
476
479
 
477
480
 
481
+ class VolumeValidateBody(RequestBody):
482
+ """The request body for the volume validate endpoint."""
483
+ name: Optional[str] = None
484
+ volume_type: Optional[str] = None
485
+ infra: Optional[str] = None
486
+ size: Optional[str] = None
487
+ labels: Optional[Dict[str, str]] = None
488
+ resource_name: Optional[str] = None
489
+ config: Optional[Dict[str, Any]] = None
490
+
491
+
478
492
  class EndpointsBody(RequestBody):
479
493
  """The request body for the endpoint."""
480
494
  cluster: str
@@ -669,9 +683,15 @@ class LocalUpBody(RequestBody):
669
683
  ssh_key: Optional[str] = None
670
684
  cleanup: bool = False
671
685
  context_name: Optional[str] = None
686
+ name: Optional[str] = None
672
687
  password: Optional[str] = None
673
688
 
674
689
 
690
+ class LocalDownBody(RequestBody):
691
+ """The request body for the local down endpoint."""
692
+ name: Optional[str] = None
693
+
694
+
675
695
  class SSHUpBody(RequestBody):
676
696
  """The request body for the SSH up/down endpoints."""
677
697
  infra: Optional[str] = None
@@ -709,19 +729,22 @@ class JobsDownloadLogsBody(RequestBody):
709
729
 
710
730
  class JobsPoolApplyBody(RequestBody):
711
731
  """The request body for the jobs pool apply endpoint."""
712
- task: str
732
+ task: Optional[str] = None
733
+ workers: Optional[int] = None
713
734
  pool_name: str
714
735
  mode: serve.UpdateMode
715
736
 
716
737
  def to_kwargs(self) -> Dict[str, Any]:
717
738
  kwargs = super().to_kwargs()
718
- dag = common.process_mounts_in_task_on_api_server(self.task,
719
- self.env_vars,
720
- workdir_only=False)
721
- assert len(
722
- dag.tasks) == 1, ('Must only specify one task in the DAG for '
723
- 'a pool.', dag)
724
- kwargs['task'] = dag.tasks[0]
739
+ if self.task is not None:
740
+ dag = common.process_mounts_in_task_on_api_server(
741
+ self.task, self.env_vars, workdir_only=False)
742
+ assert len(
743
+ dag.tasks) == 1, ('Must only specify one task in the DAG for '
744
+ 'a pool.', dag)
745
+ kwargs['task'] = dag.tasks[0]
746
+ else:
747
+ kwargs['task'] = None
725
748
  return kwargs
726
749
 
727
750
 
sky/server/requests/preconditions.py CHANGED
@@ -146,10 +146,9 @@ class ClusterStartCompletePrecondition(Precondition):
146
146
  self.cluster_name = cluster_name
147
147
 
148
148
  async def check(self) -> Tuple[bool, Optional[str]]:
149
- cluster_record = global_user_state.get_cluster_from_name(
149
+ cluster_status = global_user_state.get_status_from_cluster_name(
150
150
  self.cluster_name)
151
- if (cluster_record and
152
- cluster_record['status'] is status_lib.ClusterStatus.UP):
151
+ if cluster_status is status_lib.ClusterStatus.UP:
153
152
  # Shortcut for started clusters, ignore cluster not found
154
153
  # since the cluster record might not yet be created by the
155
154
  # launch task.
sky/server/rest.py CHANGED
@@ -9,6 +9,7 @@ import typing
9
9
  from typing import Any, Callable, cast, Optional, TypeVar
10
10
 
11
11
  import colorama
12
+ import urllib3.exceptions
12
13
 
13
14
  from sky import exceptions
14
15
  from sky import sky_logging
@@ -53,6 +54,7 @@ _session.headers[constants.VERSION_HEADER] = (
53
54
  _transient_errors = [
54
55
  requests.exceptions.RequestException,
55
56
  ConnectionError,
57
+ urllib3.exceptions.HTTPError,
56
58
  ]
57
59
 
58
60
 
sky/server/server.py CHANGED
@@ -445,6 +445,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
445
445
  loop.call_at(target, tick)
446
446
 
447
447
 
448
+ def schedule_on_boot_check():
449
+ try:
450
+ executor.schedule_request(
451
+ request_id='skypilot-server-on-boot-check',
452
+ request_name='check',
453
+ request_body=payloads.CheckBody(),
454
+ func=sky_check.check,
455
+ schedule_type=requests_lib.ScheduleType.SHORT,
456
+ is_skypilot_system=True,
457
+ )
458
+ except exceptions.RequestAlreadyExistsError:
459
+ # Lifespan will be executed in each uvicorn worker process, we
460
+ # can safely ignore the error if the task is already scheduled.
461
+ logger.debug('Request skypilot-server-on-boot-check already exists.')
462
+
463
+
448
464
  @contextlib.asynccontextmanager
449
465
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
450
466
  """FastAPI lifespan context manager."""
@@ -469,6 +485,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
469
485
  # Lifespan will be executed in each uvicorn worker process, we
470
486
  # can safely ignore the error if the task is already scheduled.
471
487
  logger.debug(f'Request {event.id} already exists.')
488
+ schedule_on_boot_check()
472
489
  asyncio.create_task(cleanup_upload_ids())
473
490
  if metrics_utils.METRICS_ENABLED:
474
491
  # Start monitoring the event loop lag in each server worker
@@ -1216,19 +1233,8 @@ async def logs(
1216
1233
  schedule_type=requests_lib.ScheduleType.SHORT,
1217
1234
  request_cluster_name=cluster_job_body.cluster_name,
1218
1235
  )
1219
- task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1220
-
1221
- async def cancel_task():
1222
- try:
1223
- logger.info('Client disconnected for request: '
1224
- f'{request.state.request_id}')
1225
- task.cancel()
1226
- await task
1227
- except asyncio.CancelledError:
1228
- pass
1229
-
1230
- # Cancel the task after the request is done or client disconnects
1231
- background_tasks.add_task(cancel_task)
1236
+ task = executor.execute_request_in_coroutine(request_task)
1237
+ background_tasks.add_task(task.cancel)
1232
1238
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
1233
1239
  # the same approach as /stream.
1234
1240
  return stream_utils.stream_response(
@@ -1354,10 +1360,12 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1354
1360
  effective_tail = None if tail is None or tail <= 0 else tail
1355
1361
 
1356
1362
  return fastapi.responses.StreamingResponse(
1357
- content=stream_utils.log_streamer(None,
1358
- log_path,
1359
- tail=effective_tail,
1360
- follow=follow),
1363
+ content=stream_utils.log_streamer(
1364
+ None,
1365
+ log_path,
1366
+ tail=effective_tail,
1367
+ follow=follow,
1368
+ cluster_name=cluster_body.cluster_name),
1361
1369
  media_type='text/plain',
1362
1370
  headers={
1363
1371
  'Cache-Control': 'no-cache, no-transform',
@@ -1419,12 +1427,13 @@ async def local_up(request: fastapi.Request,
1419
1427
 
1420
1428
 
1421
1429
  @app.post('/local_down')
1422
- async def local_down(request: fastapi.Request) -> None:
1430
+ async def local_down(request: fastapi.Request,
1431
+ local_down_body: payloads.LocalDownBody) -> None:
1423
1432
  """Tears down the Kubernetes cluster started by local_up."""
1424
1433
  executor.schedule_request(
1425
1434
  request_id=request.state.request_id,
1426
1435
  request_name='local_down',
1427
- request_body=payloads.RequestBody(),
1436
+ request_body=local_down_body,
1428
1437
  func=core.local_down,
1429
1438
  schedule_type=requests_lib.ScheduleType.LONG,
1430
1439
  )
sky/server/stream_utils.py CHANGED
@@ -8,10 +8,12 @@ from typing import AsyncGenerator, Deque, List, Optional
8
8
  import aiofiles
9
9
  import fastapi
10
10
 
11
+ from sky import global_user_state
11
12
  from sky import sky_logging
12
13
  from sky.server.requests import requests as requests_lib
13
14
  from sky.utils import message_utils
14
15
  from sky.utils import rich_utils
16
+ from sky.utils import status_lib
15
17
 
16
18
  logger = sky_logging.init_logger(__name__)
17
19
 
@@ -22,6 +24,7 @@ logger = sky_logging.init_logger(__name__)
22
24
  _BUFFER_SIZE = 8 * 1024 # 8KB
23
25
  _BUFFER_TIMEOUT = 0.02 # 20ms
24
26
  _HEARTBEAT_INTERVAL = 30
27
+ _CLUSTER_STATUS_INTERVAL = 1
25
28
 
26
29
 
27
30
  async def _yield_log_file_with_payloads_skipped(
@@ -37,11 +40,13 @@ async def _yield_log_file_with_payloads_skipped(
37
40
  yield line_str
38
41
 
39
42
 
40
- async def log_streamer(request_id: Optional[str],
41
- log_path: pathlib.Path,
42
- plain_logs: bool = False,
43
- tail: Optional[int] = None,
44
- follow: bool = True) -> AsyncGenerator[str, None]:
43
+ async def log_streamer(
44
+ request_id: Optional[str],
45
+ log_path: pathlib.Path,
46
+ plain_logs: bool = False,
47
+ tail: Optional[int] = None,
48
+ follow: bool = True,
49
+ cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
45
50
  """Streams the logs of a request.
46
51
 
47
52
  Args:
@@ -51,6 +56,8 @@ async def log_streamer(request_id: Optional[str],
51
56
  plain_logs: Whether to show plain logs.
52
57
  tail: The number of lines to tail. If None, tail the whole file.
53
58
  follow: Whether to follow the log file.
59
+ cluster_name: The cluster name to check status for provision logs.
60
+ If provided and cluster status is UP, streaming will terminate.
54
61
  """
55
62
 
56
63
  if request_id is not None:
@@ -104,15 +111,17 @@ async def log_streamer(request_id: Optional[str],
104
111
 
105
112
  async with aiofiles.open(log_path, 'rb') as f:
106
113
  async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
107
- follow):
114
+ follow, cluster_name):
108
115
  yield chunk
109
116
 
110
117
 
111
- async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
112
- request_id: Optional[str] = None,
113
- plain_logs: bool = False,
114
- tail: Optional[int] = None,
115
- follow: bool = True) -> AsyncGenerator[str, None]:
118
+ async def _tail_log_file(
119
+ f: aiofiles.threadpool.binary.AsyncBufferedReader,
120
+ request_id: Optional[str] = None,
121
+ plain_logs: bool = False,
122
+ tail: Optional[int] = None,
123
+ follow: bool = True,
124
+ cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
116
125
  """Tail the opened log file, buffer the lines and flush in chunks."""
117
126
 
118
127
  if tail is not None:
@@ -128,6 +137,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
128
137
  yield line_str
129
138
 
130
139
  last_heartbeat_time = asyncio.get_event_loop().time()
140
+ last_cluster_status_check_time = asyncio.get_event_loop().time()
131
141
 
132
142
  # Buffer the lines in memory and flush them in chunks to improve log
133
143
  # tailing throughput.
@@ -176,7 +186,19 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
176
186
  break
177
187
  if not follow:
178
188
  break
179
-
189
+ # Provision logs pass in cluster_name, check cluster status
190
+ # periodically to see if provisioning is done. We only
191
+ # check once a second to avoid overloading the DB.
192
+ check_status = (current_time - last_cluster_status_check_time
193
+ ) >= _CLUSTER_STATUS_INTERVAL
194
+ if cluster_name is not None and check_status:
195
+ cluster_record = await (
196
+ global_user_state.get_status_from_cluster_name_async(
197
+ cluster_name))
198
+ if (cluster_record is None or
199
+ cluster_record != status_lib.ClusterStatus.INIT):
200
+ break
201
+ last_cluster_status_check_time = current_time
180
202
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
181
203
  # Currently just used to keep the connection busy, refer to
182
204
  # https://github.com/skypilot-org/skypilot/issues/5750 for
sky/setup_files/dependencies.py CHANGED
@@ -49,8 +49,15 @@ install_requires = [
49
49
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
50
50
  'pyyaml > 3.13, != 5.4.*',
51
51
  'requests',
52
+ # SkyPilot inherits from uvicorn.Server to customize the behavior of
53
+ # uvicorn, so we need to pin uvicorn version to avoid potential break
54
+ # changes.
55
+ # Notes for current version check:
56
+ # - uvicorn 0.33.0 is the latest version that supports Python 3.8
57
+ # - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
58
+ # behavior.
59
+ 'uvicorn[standard] >=0.33.0, <0.36.0',
52
60
  'fastapi',
53
- 'uvicorn[standard]',
54
61
  # Some pydantic versions are not compatible with ray. Adopted from ray's
55
62
  # setup.py:
56
63
  # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
@@ -105,6 +112,7 @@ server_dependencies = [
105
112
  GRPC,
106
113
  PROTOBUF,
107
114
  'aiosqlite',
115
+ 'greenlet',
108
116
  ]
109
117
 
110
118
  local_ray = [
@@ -185,7 +193,9 @@ extras_require: Dict[str, List[str]] = {
185
193
  'remote': remote,
186
194
  # For the container registry auth api. Reference:
187
195
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
188
- 'runpod': ['runpod>=1.6.1'],
196
+ # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
197
+ # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
198
+ 'runpod': ['runpod>=1.6.1', 'tomli; python_version < "3.11"'],
189
199
  'fluidstack': [], # No dependencies needed for fluidstack
190
200
  'cudo': ['cudo-compute>=0.1.10'],
191
201
  'paperspace': [], # No dependencies needed for paperspace
sky/setup_files/setup.py CHANGED
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
148
148
  long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
149
149
  long_description = parse_readme(long_description)
150
150
 
151
- atexit.register(revert_commit_hash)
152
- replace_commit_hash()
153
-
154
- setuptools.setup(
155
- # NOTE: this affects the package.whl wheel name. When changing this (if
156
- # ever), you must grep for '.whl' and change all corresponding wheel paths
157
- # (templates/*.j2 and wheel_utils.py).
158
- name='skypilot-nightly',
159
- version=find_version(),
160
- packages=setuptools.find_packages(),
161
- author='SkyPilot Team',
162
- license='Apache 2.0',
163
- readme='README.md',
164
- description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
- long_description=long_description,
166
- long_description_content_type='text/markdown',
167
- setup_requires=['wheel'],
168
- requires_python='>=3.7',
169
- install_requires=dependencies['install_requires'],
170
- extras_require=dependencies['extras_require'],
171
- entry_points={
172
- 'console_scripts': ['sky = sky.cli:cli'],
173
- },
174
- include_package_data=True,
175
- classifiers=[
176
- 'Programming Language :: Python :: 3.7',
177
- 'Programming Language :: Python :: 3.8',
178
- 'Programming Language :: Python :: 3.9',
179
- 'Programming Language :: Python :: 3.10',
180
- 'Programming Language :: Python :: 3.11',
181
- 'Programming Language :: Python :: 3.12',
182
- 'Programming Language :: Python :: 3.13',
183
- 'License :: OSI Approved :: Apache Software License',
184
- 'Operating System :: OS Independent',
185
- 'Topic :: Software Development :: Libraries :: Python Modules',
186
- 'Topic :: System :: Distributed Computing',
187
- ],
188
- project_urls={
189
- 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
- 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
- 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
- 'Documentation': 'https://docs.skypilot.co/',
193
- },
194
- )
151
+ if __name__ == '__main__':
152
+ atexit.register(revert_commit_hash)
153
+ replace_commit_hash()
154
+ setuptools.setup(
155
+ # NOTE: this affects the package.whl wheel name. When changing this (if
156
+ # ever), you must grep for '.whl' and change all corresponding wheel paths
157
+ # (templates/*.j2 and wheel_utils.py).
158
+ name='skypilot-nightly',
159
+ version=find_version(),
160
+ packages=setuptools.find_packages(),
161
+ author='SkyPilot Team',
162
+ license='Apache 2.0',
163
+ readme='README.md',
164
+ description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
+ long_description=long_description,
166
+ long_description_content_type='text/markdown',
167
+ setup_requires=['wheel'],
168
+ requires_python='>=3.7',
169
+ install_requires=dependencies['install_requires'],
170
+ extras_require=dependencies['extras_require'],
171
+ entry_points={
172
+ 'console_scripts': ['sky = sky.cli:cli'],
173
+ },
174
+ include_package_data=True,
175
+ classifiers=[
176
+ 'Programming Language :: Python :: 3.7',
177
+ 'Programming Language :: Python :: 3.8',
178
+ 'Programming Language :: Python :: 3.9',
179
+ 'Programming Language :: Python :: 3.10',
180
+ 'Programming Language :: Python :: 3.11',
181
+ 'Programming Language :: Python :: 3.12',
182
+ 'Programming Language :: Python :: 3.13',
183
+ 'License :: OSI Approved :: Apache Software License',
184
+ 'Operating System :: OS Independent',
185
+ 'Topic :: Software Development :: Libraries :: Python Modules',
186
+ 'Topic :: System :: Distributed Computing',
187
+ ],
188
+ project_urls={
189
+ 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
+ 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
+ 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
+ 'Documentation': 'https://docs.skypilot.co/',
193
+ },
194
+ )
sky/skylet/constants.py CHANGED
@@ -64,9 +64,8 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
64
64
  'curl -LsSf https://astral.sh/uv/install.sh '
65
65
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
66
66
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
67
- SKY_UV_RUN_CMD: str = (
68
- f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run --active '
69
- '--no-project --no-config')
67
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
68
+ '--no-project --no-config')
70
69
  # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
71
70
  # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
72
71
  # not work when conda is used.
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -510,6 +510,16 @@ available_node_types:
510
510
  valueFrom:
511
511
  fieldRef:
512
512
  fieldPath: metadata.labels['ray-node-type']
513
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
514
+ valueFrom:
515
+ resourceFieldRef:
516
+ containerName: ray-node
517
+ resource: requests.cpu
518
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
519
+ valueFrom:
520
+ resourceFieldRef:
521
+ containerName: ray-node
522
+ resource: requests.memory
513
523
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
514
524
  - name: {{ key }}
515
525
  value: {{ value }}
@@ -630,13 +640,6 @@ available_node_types:
630
640
  command: ["/bin/bash", "-c", "--"]
631
641
  args:
632
642
  - |
633
- # For backwards compatibility, we put a marker file in the pod
634
- # to indicate that the pod is running with the changes introduced
635
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
636
- # TODO: Remove this marker file and it's usage in setup_commands
637
- # after v0.10.0 release.
638
- touch /tmp/skypilot_is_nimbus
639
-
640
643
  # Helper function to conditionally use sudo
641
644
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
642
645
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -1333,18 +1336,16 @@ setup_commands:
1333
1336
  # Wait for SSH setup to complete before proceeding
1334
1337
  if [ -f /tmp/apt_ssh_setup_started ]; then
1335
1338
  echo "=== Logs for asynchronous SSH setup ===";
1336
- [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
1337
- { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1339
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1340
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1338
1341
  [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1339
1342
  fi
1340
1343
 
1341
1344
  echo "=== Logs for asynchronous ray and skypilot installation ===";
1342
- if [ -f /tmp/skypilot_is_nimbus ]; then
1343
- echo "=== Logs for asynchronous ray and skypilot installation ===";
1344
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
1345
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1346
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1347
- fi
1345
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1346
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1347
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1348
+
1348
1349
  end_epoch=$(date +%s);
1349
1350
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
1350
1351
  start_epoch=$(date +%s);
sky/usage/usage_lib.py CHANGED
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
14
14
 
15
15
  import sky
16
16
  from sky import sky_logging
17
+ from sky import skypilot_config
17
18
  from sky.adaptors import common as adaptors_common
18
19
  from sky.usage import constants
19
20
  from sky.utils import common_utils
@@ -167,6 +168,7 @@ class UsageMessageToReport(MessageToReport):
167
168
  self.runtimes: Dict[str, float] = {} # update_runtime
168
169
  self.exception: Optional[str] = None # entrypoint_context
169
170
  self.stacktrace: Optional[str] = None # entrypoint_context
171
+ self.skypilot_config: Optional[Dict[str, Any]] = None
170
172
 
171
173
  # Whether API server is deployed remotely.
172
174
  self.using_remote_api_server: bool = (
@@ -177,6 +179,7 @@ class UsageMessageToReport(MessageToReport):
177
179
  self.client_entrypoint = common_utils.get_current_client_entrypoint(
178
180
  msg)
179
181
  self.entrypoint = msg
182
+ self.skypilot_config = dict(skypilot_config.to_dict())
180
183
 
181
184
  def set_internal(self):
182
185
  self.internal = True
sky/utils/cli_utils/status_utils.py CHANGED
@@ -11,6 +11,7 @@ from sky.utils import common_utils
11
11
  from sky.utils import log_utils
12
12
  from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
+ from sky.utils import ux_utils
14
15
 
15
16
  if typing.TYPE_CHECKING:
16
17
  from sky.provision.kubernetes import utils as kubernetes_utils
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
105
106
 
106
107
  if query_clusters:
107
108
  cluster_names = {record['name'] for record in cluster_records}
108
- not_found_clusters = [
109
- repr(cluster)
110
- for cluster in query_clusters
111
- if cluster not in cluster_names
112
- ]
109
+ not_found_clusters = ux_utils.get_non_matched_query(
110
+ query_clusters, cluster_names)
111
+ not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
113
112
  if not_found_clusters:
114
113
  cluster_str = 'Cluster'
115
114
  if len(not_found_clusters) > 1: