skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ import getpass
6
6
  import os
7
7
  import tempfile
8
8
  import typing
9
- from typing import Any, Dict, Iterable, List, Optional, Set
9
+ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
10
10
  import uuid
11
11
 
12
12
  import colorama
@@ -46,7 +46,7 @@ logger = sky_logging.init_logger(__name__)
46
46
  # controller resources spec.
47
47
  CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = (
48
48
  '{controller_type} controller resources is not valid, please check '
49
- '~/.sky/skyconfig.yaml file and make sure '
49
+ '~/.sky/config.yaml file and make sure '
50
50
  '{controller_type}.controller.resources is a valid resources spec. '
51
51
  'Details:\n {err}')
52
52
 
@@ -72,6 +72,7 @@ class _ControllerSpec:
72
72
  default_hint_if_non_existent: str
73
73
  connection_error_hint: str
74
74
  default_resources_config: Dict[str, Any]
75
+ default_autostop_config: Dict[str, Any]
75
76
 
76
77
  @property
77
78
  def decline_down_when_failed_to_fetch_status_hint(self) -> str:
@@ -118,7 +119,8 @@ class Controllers(enum.Enum):
118
119
  default_hint_if_non_existent='No in-progress managed jobs.',
119
120
  connection_error_hint=(
120
121
  'Failed to connect to jobs controller, please try again later.'),
121
- default_resources_config=managed_job_constants.CONTROLLER_RESOURCES)
122
+ default_resources_config=managed_job_constants.CONTROLLER_RESOURCES,
123
+ default_autostop_config=managed_job_constants.CONTROLLER_AUTOSTOP)
122
124
  SKY_SERVE_CONTROLLER = _ControllerSpec(
123
125
  controller_type='serve',
124
126
  name='serve controller',
@@ -148,7 +150,8 @@ class Controllers(enum.Enum):
148
150
  default_hint_if_non_existent='No live services.',
149
151
  connection_error_hint=(
150
152
  'Failed to connect to serve controller, please try again later.'),
151
- default_resources_config=serve_constants.CONTROLLER_RESOURCES)
153
+ default_resources_config=serve_constants.CONTROLLER_RESOURCES,
154
+ default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
152
155
 
153
156
  @classmethod
154
157
  def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
@@ -262,8 +265,9 @@ def _get_cloud_dependencies_installation_commands(
262
265
  ' ARCH="amd64"; '
263
266
  'fi && '
264
267
  '(command -v kubectl &>/dev/null || '
265
- '("https://dl.k8s.io/release/v1.31.6/bin/linux/$ARCH/kubectl" '
266
- '&& sudo install -o root -g root -m 0755 '
268
+ '(curl -s -LO "https://dl.k8s.io/release/v1.31.6'
269
+ '/bin/linux/$ARCH/kubectl" && '
270
+ 'sudo install -o root -g root -m 0755 '
267
271
  'kubectl /usr/local/bin/kubectl))')
268
272
  elif isinstance(cloud, clouds.Cudo):
269
273
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
@@ -389,7 +393,6 @@ def download_and_stream_latest_job_log(
389
393
  f'Failed to stream the logs for the user program at '
390
394
  f'{log_file}: {common_utils.format_exception(e)}',
391
395
  exc_info=True)
392
- # Return the log_file anyway.
393
396
 
394
397
  return log_file
395
398
 
@@ -601,6 +604,40 @@ def get_controller_resources(
601
604
  return result
602
605
 
603
606
 
607
+ def get_controller_autostop_config(
608
+ controller: Controllers) -> Tuple[Optional[int], bool]:
609
+ """Get the autostop config for the controller.
610
+
611
+ Returns:
612
+ A tuple of (idle_minutes_to_autostop, down), which correspond to the
613
+ values passed to execution.launch().
614
+ """
615
+ controller_autostop_config_copied: Dict[str, Any] = copy.copy(
616
+ controller.value.default_autostop_config)
617
+ if skypilot_config.loaded():
618
+ custom_controller_autostop_config = skypilot_config.get_nested(
619
+ (controller.value.controller_type, 'controller', 'autostop'), None)
620
+ if custom_controller_autostop_config is False:
621
+ # Disabled with `autostop: false` in config.
622
+ # To indicate autostop is disabled, we return None for
623
+ # idle_minutes_to_autostop.
624
+ return None, False
625
+ elif custom_controller_autostop_config is True:
626
+ # Enabled with default values. There is no change in behavior, but
627
+ # this is included for completeness, since `False` is valid.
628
+ pass
629
+ elif custom_controller_autostop_config is not None:
630
+ # We have specific config values.
631
+ # Override the controller autostop config with the ones specified in
632
+ # the config.
633
+ assert isinstance(custom_controller_autostop_config, dict)
634
+ controller_autostop_config_copied.update(
635
+ custom_controller_autostop_config)
636
+
637
+ return (controller_autostop_config_copied['idle_minutes'],
638
+ controller_autostop_config_copied['down'])
639
+
640
+
604
641
  def _setup_proxy_command_on_controller(
605
642
  controller_launched_cloud: 'clouds.Cloud',
606
643
  user_config: Dict[str, Any]) -> config_utils.Config:
@@ -328,9 +328,9 @@ cp kubeconfig ~/.kube/config
328
328
  # Verify that you can access the cluster
329
329
  kubectl get pods
330
330
 
331
- Also add this to your ~/.sky/skyconfig.yaml to use the new service account:
331
+ Also add this to your ~/.sky/config.yaml to use the new service account:
332
332
 
333
- # ~/.sky/skyconfig.yaml
333
+ # ~/.sky/config.yaml
334
334
  kubernetes:
335
335
  remote_identity: ${SKYPILOT_SA}
336
336
  "
@@ -3,8 +3,9 @@ import argparse
3
3
  import hashlib
4
4
  import os
5
5
  import subprocess
6
- from typing import Optional, Tuple
6
+ from typing import Dict, Optional, Tuple
7
7
 
8
+ import colorama
8
9
  import yaml
9
10
 
10
11
  import sky
@@ -13,6 +14,10 @@ from sky.provision.kubernetes import utils as kubernetes_utils
13
14
  from sky.utils import rich_utils
14
15
 
15
16
 
17
+ def _format_string(str_to_format: str, colorama_format: str) -> str:
18
+ return f'{colorama_format}{str_to_format}{colorama.Style.RESET_ALL}'
19
+
20
+
16
21
  def cleanup(context: Optional[str] = None) -> Tuple[bool, str]:
17
22
  """Deletes all Kubernetes resources created by this script
18
23
 
@@ -45,7 +50,7 @@ def get_node_hash(node_name: str):
45
50
  return md5_hash[:32]
46
51
 
47
52
 
48
- def label(context: Optional[str] = None):
53
+ def label(context: Optional[str] = None, wait_for_completion: bool = True):
49
54
  deletion_success, reason = cleanup(context=context)
50
55
  if not deletion_success:
51
56
  print(reason)
@@ -60,8 +65,10 @@ def label(context: Optional[str] = None):
60
65
  'in their capacity.')
61
66
  return
62
67
 
63
- print(f'Found {len(unlabeled_gpu_nodes)} '
64
- 'unlabeled GPU nodes in the cluster')
68
+ print(
69
+ _format_string(
70
+ f'Found {len(unlabeled_gpu_nodes)} '
71
+ 'unlabeled GPU nodes in the cluster', colorama.Fore.YELLOW))
65
72
 
66
73
  sky_dir = os.path.dirname(sky.__file__)
67
74
  manifest_dir = os.path.join(sky_dir, 'utils/kubernetes')
@@ -80,6 +87,7 @@ def label(context: Optional[str] = None):
80
87
  print('Error setting up GPU labeling: ' + output)
81
88
  return
82
89
 
90
+ jobs_to_node_names: Dict[str, str] = {}
83
91
  with rich_utils.client_status('Creating GPU labeler jobs'):
84
92
  batch_v1 = kubernetes.batch_api(context=context)
85
93
  # Load the job manifest
@@ -113,8 +121,11 @@ def label(context: Optional[str] = None):
113
121
  node_name = node.metadata.name
114
122
 
115
123
  # Modify the job manifest for the current node
116
- job_manifest['metadata']['name'] = ('sky-gpu-labeler-'
117
- f'{get_node_hash(node_name)}')
124
+ job_name = ('sky-gpu-labeler-'
125
+ f'{get_node_hash(node_name)}')
126
+ jobs_to_node_names[job_name] = node_name
127
+ job_manifest['metadata']['name'] = job_name
128
+
118
129
  job_manifest['spec']['template']['spec']['nodeSelector'] = {
119
130
  'kubernetes.io/hostname': node_name
120
131
  }
@@ -122,17 +133,85 @@ def label(context: Optional[str] = None):
122
133
 
123
134
  # Create the job for this node
124
135
  batch_v1.create_namespaced_job(namespace, job_manifest)
125
- print(f'Created GPU labeler job for node {node_name}')
136
+ print(
137
+ _format_string(f'Created GPU labeler job for node {node_name}',
138
+ colorama.Style.DIM))
126
139
 
127
140
  context_str = f' --context {context}' if context else ''
128
- print(f'GPU labeling started - this may take 10 min or more to complete.'
129
- '\nTo check the status of GPU labeling jobs, run '
130
- f'`kubectl get jobs -n kube-system '
131
- f'-l job=sky-gpu-labeler{context_str}`'
132
- '\nYou can check if nodes have been labeled by running '
133
- f'`kubectl describe nodes{context_str}` '
134
- 'and looking for labels of the format '
135
- '`skypilot.co/accelerator: <gpu_name>`. ')
141
+
142
+ if wait_for_completion:
143
+ # Wait for the job to complete
144
+ with rich_utils.client_status(
145
+ 'Waiting for GPU labeler jobs to complete'):
146
+ success = wait_for_jobs_completion(jobs_to_node_names,
147
+ 'kube-system',
148
+ context=context)
149
+ if success:
150
+ print(
151
+ _format_string('✅ GPU labeling completed successfully',
152
+ colorama.Fore.GREEN))
153
+ else:
154
+ print(_format_string('❌ GPU labeling failed', colorama.Fore.RED))
155
+ cleanup(context=context)
156
+ else:
157
+ print(
158
+ f'GPU labeling started - this may take 10 min or more to complete.'
159
+ '\nTo check the status of GPU labeling jobs, run '
160
+ f'`kubectl get jobs -n kube-system '
161
+ f'-l job=sky-gpu-labeler{context_str}`'
162
+ '\nYou can check if nodes have been labeled by running '
163
+ f'`kubectl describe nodes{context_str}` '
164
+ 'and looking for labels of the format '
165
+ '`skypilot.co/accelerator: <gpu_name>`. ')
166
+
167
+
168
+ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
169
+ namespace: str,
170
+ context: Optional[str] = None,
171
+ timeout: int = 60 * 20):
172
+ """Waits for a Kubernetes Job to complete or fail.
173
+
174
+ Args:
175
+ jobs_to_node_names: A dictionary mapping job names to node names.
176
+ namespace: The namespace the Jobs are in.
177
+ timeout: Timeout in seconds (default: 1200 seconds = 20 minutes).
178
+
179
+ Returns:
180
+ True if all Jobs completed successfully, False if any Job failed or the wait timed out.
181
+ """
182
+ batch_v1 = kubernetes.batch_api(context=context)
183
+ w = kubernetes.watch()
184
+ completed_jobs = []
185
+ for event in w.stream(func=batch_v1.list_namespaced_job,
186
+ namespace=namespace,
187
+ timeout_seconds=timeout):
188
+ job = event['object']
189
+ job_name = job.metadata.name
190
+ if job_name in jobs_to_node_names:
191
+ node_name = jobs_to_node_names[job_name]
192
+ if job.status and job.status.completion_time:
193
+ print(
194
+ _format_string(
195
+ f'GPU labeler job for node {node_name} '
196
+ 'completed successfully', colorama.Style.DIM))
197
+ completed_jobs.append(job_name)
198
+ num_remaining_jobs = len(jobs_to_node_names) - len(
199
+ completed_jobs)
200
+ if num_remaining_jobs == 0:
201
+ w.stop()
202
+ return True
203
+ elif job.status and job.status.failed:
204
+ print(
205
+ _format_string(
206
+ f'GPU labeler job for node {node_name} failed',
207
+ colorama.Style.DIM))
208
+ w.stop()
209
+ return False
210
+ print(
211
+ _format_string(
212
+ f'Timed out after waiting {timeout} seconds '
213
+ 'for job to complete', colorama.Style.DIM))
214
+ return False #Timed out
136
215
 
137
216
 
138
217
  def main():
@@ -151,6 +230,10 @@ def main():
151
230
  parser.add_argument('--context',
152
231
  type=str,
153
232
  help='the context to use for the Kubernetes cluster.')
233
+ parser.add_argument('--async',
234
+ dest='async_completion',
235
+ action='store_true',
236
+ help='do not wait for the GPU labeling to complete.')
154
237
  args = parser.parse_args()
155
238
  context = None
156
239
  if args.context:
@@ -165,7 +248,7 @@ def main():
165
248
  if args.cleanup:
166
249
  cleanup(context=context)
167
250
  else:
168
- label(context=context)
251
+ label(context=context, wait_for_completion=not args.async_completion)
169
252
 
170
253
 
171
254
  if __name__ == '__main__':
sky/utils/schemas.py CHANGED
@@ -725,6 +725,29 @@ def get_config_schema():
725
725
  if k != '$schema'
726
726
  }
727
727
  resources_schema['properties'].pop('ports')
728
+ autostop_schema = {
729
+ 'anyOf': [
730
+ {
731
+ # Use boolean to disable autostop completely, e.g.
732
+ # autostop: false
733
+ 'type': 'boolean',
734
+ },
735
+ {
736
+ 'type': 'object',
737
+ 'required': [],
738
+ 'additionalProperties': False,
739
+ 'properties': {
740
+ 'idle_minutes': {
741
+ 'type': 'integer',
742
+ 'minimum': 0,
743
+ },
744
+ 'down': {
745
+ 'type': 'boolean',
746
+ },
747
+ },
748
+ },
749
+ ],
750
+ }
728
751
  controller_resources_schema = {
729
752
  'type': 'object',
730
753
  'required': [],
@@ -736,6 +759,7 @@ def get_config_schema():
736
759
  'additionalProperties': False,
737
760
  'properties': {
738
761
  'resources': resources_schema,
762
+ 'autostop': autostop_schema,
739
763
  }
740
764
  },
741
765
  'bucket': {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250413
3
+ Version: 1.0.0.dev20250421
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -47,6 +47,7 @@ Requires-Dist: python-multipart
47
47
  Requires-Dist: aiofiles
48
48
  Requires-Dist: httpx
49
49
  Requires-Dist: setproctitle
50
+ Requires-Dist: omegaconf<2.5,>=2.4.0dev3
50
51
  Provides-Extra: aws
51
52
  Requires-Dist: urllib3<2; extra == "aws"
52
53
  Requires-Dist: awscli>=1.27.10; extra == "aws"