skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/clouds/service_catalog/common.py CHANGED
@@ -13,9 +13,9 @@ import requests
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
-from sky.clouds import cloud_registry
 from sky.clouds.service_catalog import constants
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import rich_utils
 from sky.utils import ux_utils
 
@@ -171,7 +171,9 @@ def read_catalog(filename: str,
     assert (pull_frequency_hours is None or
             pull_frequency_hours >= 0), pull_frequency_hours
     catalog_path = get_catalog_path(filename)
-    cloud = cloud_registry.CLOUD_REGISTRY.from_str(os.path.dirname(filename))
+    cloud = os.path.dirname(filename)
+    if cloud != 'common':
+        cloud = str(registry.CLOUD_REGISTRY.from_str(cloud))
 
     meta_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, '.meta', filename)
     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
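Note: the rewritten `read_catalog` lookup keeps the plain directory name for shared (`common`) catalogs and canonicalizes every other directory through the new registry. A minimal illustration of the intended mapping, with a hypothetical stand-in for `str(registry.CLOUD_REGISTRY.from_str(...))`:

```python
import os

def _canonical(cloud: str) -> str:
    # Hypothetical stand-in for str(registry.CLOUD_REGISTRY.from_str(cloud)),
    # which returns the registered cloud's canonical display name.
    return {'aws': 'AWS', 'gcp': 'GCP'}.get(cloud, cloud)

for filename in ('aws/vms.csv', 'gcp/vms.csv', 'common/images.csv'):
    cloud = os.path.dirname(filename)
    if cloud != 'common':  # 'common' catalogs skip the registry lookup
        cloud = _canonical(cloud)
    print(f'{filename} -> {cloud}')
```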
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -5,7 +5,6 @@ VMs, GPUs, and TPUs. The script takes about 1-2 minutes to run.
 """
 
 import argparse
-import functools
 import io
 import multiprocessing
 import os
@@ -20,6 +19,7 @@ import numpy as np
 
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import gcp
+from sky.utils import annotations
 from sky.utils import common_utils
 
 if typing.TYPE_CHECKING:
@@ -281,7 +281,7 @@ def filter_zones(func: Callable[[], List[str]]) -> Callable[[], List[str]]:
 
 
 @filter_zones
-@functools.lru_cache(maxsize=None)
+@annotations.lru_cache(scope='global', maxsize=None)
 def _get_all_zones() -> List[str]:
     zones_request = gcp_client.zones().list(project=project_id)
     zones = []
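Note: `fetch_gcp.py` swaps `functools.lru_cache` for the new `sky.utils.annotations.lru_cache`, which takes an explicit cache scope. A plausible sketch of such a scope-aware wrapper, assuming `'request'`-scoped caches are cleared by the new API server between requests (the helper names below are illustrative, not the actual implementation):

```python
import functools
from typing import Any, Callable, List

_REQUEST_SCOPED: List[Any] = []  # cached functions to reset per request

def lru_cache(scope: str, *lru_args, **lru_kwargs) -> Callable:
    """functools.lru_cache plus a scope tag ('global' or 'request')."""
    assert scope in ('global', 'request'), scope

    def decorator(func: Callable) -> Callable:
        cached = functools.lru_cache(*lru_args, **lru_kwargs)(func)
        if scope == 'request':
            # Remember request-scoped caches so a server can reset them.
            _REQUEST_SCOPED.append(cached)
        return cached

    return decorator

def clear_request_level_cache() -> None:
    # Hypothetical hook a server would call at the end of each request.
    for cached in _REQUEST_SCOPED:
        cached.cache_clear()
```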
sky/clouds/utils/oci_utils.py CHANGED
@@ -17,8 +17,8 @@ import os
 
 from sky import sky_logging
 from sky import skypilot_config
-from sky import status_lib
 from sky.utils import resources_utils
+from sky.utils import status_lib
 
 logger = sky_logging.init_logger(__name__)
 
sky/clouds/vast.py CHANGED
@@ -5,13 +5,14 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
 
 from sky import clouds
 from sky.clouds import service_catalog
+from sky.utils import registry
 from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
 
 
-@clouds.CLOUD_REGISTRY.register
+@registry.CLOUD_REGISTRY.register
 class Vast(clouds.Cloud):
     """ Vast GPU Cloud
 
sky/clouds/vsphere.py CHANGED
@@ -11,6 +11,7 @@ from sky.provision.vsphere import vsphere_utils
 from sky.provision.vsphere.vsphere_utils import get_vsphere_credentials
 from sky.provision.vsphere.vsphere_utils import initialize_vsphere_data
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
@@ -24,7 +25,7 @@ _CREDENTIAL_FILES = [
 ]
 
 
-@clouds.CLOUD_REGISTRY.register
+@registry.CLOUD_REGISTRY.register
 class Vsphere(clouds.Cloud):
     """Vsphere cloud"""
 
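Note: Vast and Vsphere above (like the other providers in the file list) now register themselves via the new `sky/utils/registry.py` instead of the deleted `sky/clouds/cloud_registry.py`. A minimal sketch of the decorator-based registry this implies; only the `CLOUD_REGISTRY.register` / `from_str` surface is taken from the diff, the internals are assumptions:

```python
from typing import Dict, Optional, Type

class Registry:
    """Maps a lowercase name to a registered class."""

    def __init__(self) -> None:
        self._classes: Dict[str, Type] = {}

    def register(self, cls: Type) -> Type:
        # Used as a class decorator: @CLOUD_REGISTRY.register
        name = cls.__name__.lower()
        assert name not in self._classes, f'{name} is already registered'
        self._classes[name] = cls
        return cls

    def from_str(self, name: Optional[str]):
        # Returns an instance of the registered class, or None for None input.
        if name is None:
            return None
        if name.lower() not in self._classes:
            raise ValueError(f'Unknown cloud: {name!r}')
        return self._classes[name.lower()]()

CLOUD_REGISTRY = Registry()
```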
sky/core.py CHANGED
@@ -1,29 +1,38 @@
 """SDK functions for cluster/job management."""
-import getpass
+import os
+import shlex
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import colorama
 
 from sky import backends
+from sky import check as sky_check
 from sky import clouds
 from sky import dag
 from sky import data
 from sky import exceptions
 from sky import global_user_state
-from sky import jobs as managed_jobs
+from sky import models
 from sky import sky_logging
-from sky import status_lib
 from sky import task
 from sky.backends import backend_utils
+from sky.clouds import service_catalog
+from sky.jobs.server import core as managed_jobs_core
+from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.utils.kubernetes import kubernetes_deploy_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -34,14 +43,15 @@ logger = sky_logging.init_logger(__name__)
 # = Cluster Management =
 # ======================
 
-# pylint: disable=redefined-builtin
-
 
 @usage_lib.entrypoint
-def status(cluster_names: Optional[Union[str, List[str]]] = None,
-           refresh: bool = False) -> List[Dict[str, Any]]:
+def status(
+    cluster_names: Optional[Union[str, List[str]]] = None,
+    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
+    all_users: bool = False,
+) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get cluster statuses.
+    """Gets cluster statuses.
 
     If cluster_names is given, return those clusters. Otherwise, return all
     clusters.
@@ -60,6 +70,10 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
             'autostop': (int) idle time before autostop,
             'to_down': (bool) whether autodown is used instead of autostop,
             'metadata': (dict) metadata of the cluster,
+            'user_hash': (str) user hash of the cluster owner,
+            'user_name': (str) user name of the cluster owner,
+            'resources_str': (str) the resource string representation of the
+                cluster,
         }
 
     Each cluster can have one of the following statuses:
@@ -108,16 +122,17 @@
         cluster. If a cluster is found to be terminated or not found, it will
         be omitted from the returned list.
     """
-    return backend_utils.get_clusters(include_controller=True,
-                                      refresh=refresh,
-                                      cluster_names=cluster_names)
+    clusters = backend_utils.get_clusters(refresh=refresh,
+                                          cluster_names=cluster_names,
+                                          all_users=all_users)
+    return clusters
 
 
 def status_kubernetes(
-) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
-           List['kubernetes_utils.KubernetesSkyPilotClusterInfo'], List[Dict[
-               str, Any]], Optional[str]]:
-    """Get all SkyPilot clusters and jobs in the Kubernetes cluster.
+) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List[Dict[str, Any]], Optional[str]]:
+    """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
 
     Managed jobs and services are also included in the clusters returned.
     The caller must parse the controllers to identify which clusters are run
@@ -125,11 +140,11 @@ def status_kubernetes(
    all_clusters, unmanaged_clusters, all_jobs, context
     Returns:
         A tuple containing:
-        - all_clusters: List of KubernetesSkyPilotClusterInfo with info for
-          all clusters, including managed jobs, services and controllers.
-        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfo with info
-          for all clusters excluding managed jobs and services. Controllers
-          are included.
+        - all_clusters: List of KubernetesSkyPilotClusterInfoPayload with info
+          for all clusters, including managed jobs, services and controllers.
+        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfoPayload with
+          info for all clusters excluding managed jobs and services.
+          Controllers are included.
         - all_jobs: List of managed jobs from all controllers. Each entry is a
           dictionary job info, see jobs.queue_from_kubernetes_pod for details.
         - context: Kubernetes context used to fetch the cluster information.
@@ -155,7 +170,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
                 status_message += f's ({i + 1}/{len(jobs_controllers)})'
             spinner.update(f'{status_message}[/]')
             try:
-                job_list = managed_jobs.queue_from_kubernetes_pod(
+                job_list = managed_jobs_core.queue_from_kubernetes_pod(
                     pod.metadata.name)
             except RuntimeError as e:
                 logger.warning('Failed to get managed jobs from controller '
@@ -183,6 +198,14 @@ all_clusters, unmanaged_clusters, all_jobs, context
         c for c in all_clusters
         if c.cluster_name not in managed_job_cluster_names
     ]
+    all_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in all_clusters
+    ]
+    unmanaged_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in unmanaged_clusters
+    ]
     return all_clusters, unmanaged_clusters, all_jobs, context
 
 
@@ -253,6 +276,9 @@ def cost_report() -> List[Dict[str, Any]]:
 
     for cluster_report in cluster_reports:
         cluster_report['total_cost'] = get_total_cost(cluster_report)
+        cluster_report['cloud'] = str(cluster_report['resources'].cloud)
+        cluster_report['accelerators'] = cluster_report[
+            'resources'].accelerators
 
     return cluster_reports
 
@@ -392,10 +418,45 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
     return message
 
 
+@usage_lib.entrypoint
+def down(cluster_name: str, purge: bool = False) -> None:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Tears down a cluster.
+
+    Tearing down a cluster will delete all associated resources (all billing
+    stops), and any data on the attached disks will be lost. Accelerators
+    (e.g., TPUs) that are part of the cluster will be deleted too.
+
+    Args:
+        cluster_name: name of the cluster to down.
+        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
+            table, even if the actual cluster termination failed on the cloud.
+            WARNING: This flag should only be set sparingly in certain manual
+            troubleshooting scenarios; with it set, it is the user's
+            responsibility to ensure there are no leaked instances and related
+            resources.
+
+    Raises:
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
+        RuntimeError: failed to tear down the cluster.
+        sky.exceptions.NotSupportedError: the specified cluster is the managed
+            jobs controller.
+    """
+    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
+
+    usage_lib.record_cluster_name_for_current_operation(cluster_name)
+    backend = backend_utils.get_backend_from_handle(handle)
+    backend.teardown(handle, terminate=True, purge=purge)
+
+
 @usage_lib.entrypoint
 def stop(cluster_name: str, purge: bool = False) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Stop a cluster.
+    """Stops a cluster.
 
     Data on attached disks is not lost when a cluster is stopped. Billing for
     the instances will stop, while the disks will still be charged. Those
@@ -452,41 +513,6 @@ def stop(cluster_name: str, purge: bool = False) -> None:
     backend.teardown(handle, terminate=False, purge=purge)
 
 
-@usage_lib.entrypoint
-def down(cluster_name: str, purge: bool = False) -> None:
-    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tear down a cluster.
-
-    Tearing down a cluster will delete all associated resources (all billing
-    stops), and any data on the attached disks will be lost. Accelerators
-    (e.g., TPUs) that are part of the cluster will be deleted too.
-
-    Args:
-        cluster_name: name of the cluster to down.
-        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
-            table, even if the actual cluster termination failed on the cloud.
-            WARNING: This flag should only be set sparingly in certain manual
-            troubleshooting scenarios; with it set, it is the user's
-            responsibility to ensure there are no leaked instances and related
-            resources.
-
-    Raises:
-        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
-            exist.
-        RuntimeError: failed to tear down the cluster.
-        sky.exceptions.NotSupportedError: the specified cluster is the managed
-            jobs controller.
-    """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-    if handle is None:
-        raise exceptions.ClusterDoesNotExist(
-            f'Cluster {cluster_name!r} does not exist.')
-
-    usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend = backend_utils.get_backend_from_handle(handle)
-    backend.teardown(handle, terminate=True, purge=purge)
-
-
 @usage_lib.entrypoint
 def autostop(
     cluster_name: str,
@@ -494,7 +520,7 @@ def autostop(
     down: bool = False,  # pylint: disable=redefined-outer-name
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Schedule an autostop/autodown for a cluster.
+    """Schedules an autostop/autodown for a cluster.
 
     Autostop/autodown will automatically stop or teardown a cluster when it
     becomes idle for a specified duration. Idleness means there are no
@@ -601,7 +627,7 @@ def queue(cluster_name: str,
           skip_finished: bool = False,
           all_users: bool = False) -> List[dict]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get the job queue of a cluster.
+    """Gets the job queue of a cluster.
 
     Please refer to the sky.cli.queue for the document.
 
@@ -612,6 +638,7 @@
             'job_id': (int) job id,
             'job_name': (str) job name,
             'username': (str) username,
+            'user_hash': (str) user hash,
             'submitted_at': (int) timestamp of submitted,
             'start_at': (int) timestamp of started,
             'end_at': (int) timestamp of ended,
@@ -632,10 +659,10 @@
         exceptions.CommandError: if failed to get the job queue with ssh.
     """
     all_jobs = not skip_finished
-    username: Optional[str] = getpass.getuser()
+    user_hash: Optional[str] = common_utils.get_user_hash()
     if all_users:
-        username = None
-    code = job_lib.JobLibCodeGen.get_job_queue(username, all_jobs)
+        user_hash = None
+    code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
 
     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -662,19 +689,22 @@
 def cancel(
     cluster_name: str,
     all: bool = False,
+    all_users: bool = False,
     job_ids: Optional[List[int]] = None,
     # pylint: disable=invalid-name
+    # Internal only:
     _try_cancel_if_cluster_is_init: bool = False,
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Cancel jobs on a cluster.
+    """Cancels jobs on a cluster.
 
     Please refer to the sky.cli.cancel for the document.
 
-    When `all` is False and `job_ids` is None, cancel the latest running job.
+    When none of `job_ids`, `all` and `all_users` is set, cancel the latest
+    running job.
 
     Additional arguments:
-        _try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
+        try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
             even if the cluster is not UP, but the head node is still alive.
             This is used by the jobs controller to cancel the job when the
             worker node is preempted in the spot cluster.
@@ -693,9 +723,9 @@
     controller_utils.check_cluster_name_not_controller(
         cluster_name, operation_str='Cancelling jobs')
 
-    if all and job_ids:
-        raise ValueError('Cannot specify both `all` and `job_ids`. To cancel '
-                         'all jobs, set `job_ids` to None.')
+    if all and job_ids is not None:
+        raise exceptions.NotSupportedError(
+            'Cannot specify both --all and job IDs.')
 
     # Check the status of the cluster.
     handle = None
@@ -722,28 +752,32 @@
 
     backend = backend_utils.get_backend_from_handle(handle)
 
-    if all:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          f'Cancelling all jobs on cluster {cluster_name!r}...'
-                          f'{colorama.Style.RESET_ALL}')
-    elif job_ids is None:
-        # all = False, job_ids is None => cancel the latest running job.
+    if all_users:
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
-            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
-    elif job_ids:
-        # all = False, len(job_ids) > 0 => cancel the specified jobs.
+    elif all:
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling all your jobs on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
+    elif job_ids is not None:
         jobs_str = ', '.join(map(str, job_ids))
         sky_logging.print(
            f'{colorama.Fore.YELLOW}'
            f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...'
            f'{colorama.Style.RESET_ALL}')
     else:
-        # all = False, len(job_ids) == 0 => no jobs to cancel.
-        return
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
 
-    backend.cancel_jobs(handle, job_ids, all)
+    backend.cancel_jobs(handle,
+                        job_ids,
+                        cancel_all=all or all_users,
+                        user_hash=common_utils.get_user_hash())
 
 
 @usage_lib.entrypoint
@@ -752,7 +786,7 @@ def tail_logs(cluster_name: str,
               follow: bool = True,
               tail: int = 0) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tail the logs of a job.
+    """Tails the logs of a job.
 
     Please refer to the sky.cli.tail_logs for the document.
 
@@ -774,14 +808,6 @@
     )
     backend = backend_utils.get_backend_from_handle(handle)
 
-    job_str = f'job {job_id}'
-    if job_id is None:
-        job_str = 'the last job'
-    sky_logging.print(
-        f'{colorama.Fore.YELLOW}'
-        f'Tailing logs of {job_str} on cluster {cluster_name!r}...'
-        f'{colorama.Style.RESET_ALL}')
-
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend.tail_logs(handle, job_id, follow=follow, tail=tail)
 
@@ -792,7 +818,7 @@ def download_logs(
         job_ids: Optional[List[str]],
         local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Download the logs of jobs.
+    """Downloads the logs of jobs.
 
     Args:
         cluster_name: (str) name of the cluster.
@@ -884,7 +910,7 @@ def job_status(cluster_name: str,
 @usage_lib.entrypoint
 def storage_ls() -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get the storages.
+    """Gets the storages.
 
     Returns:
         [
@@ -906,7 +932,7 @@ def storage_ls() -> List[Dict[str, Any]]:
 @usage_lib.entrypoint
 def storage_delete(name: str) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Delete a storage.
+    """Deletes a storage.
 
     Raises:
         ValueError: If the storage does not exist.
@@ -915,11 +941,149 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
+    else:
+        storage_object = data.Storage(name=handle.storage_name,
+                                      source=handle.source,
+                                      sync_on_reconstruction=False)
+        storage_object.delete()
+
 
-    assert handle.storage_name == name, (
-        f'In global_user_state, storage name {name!r} does not match '
-        f'handle.storage_name {handle.storage_name!r}')
-    storage_object = data.Storage(name=handle.storage_name,
-                                  source=handle.source,
-                                  sync_on_reconstruction=False)
-    storage_object.delete()
+
+# ===================
+# = Catalog Observe =
+# ===================
+@usage_lib.entrypoint
+def enabled_clouds() -> List[clouds.Cloud]:
+    return global_user_state.get_cached_enabled_clouds()
+
+
+@usage_lib.entrypoint
+def realtime_kubernetes_gpu_availability(
+        context: Optional[str] = None,
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+) -> List[models.RealtimeGpuAvailability]:
+
+    counts, capacity, available = service_catalog.list_accelerator_realtime(
+        gpus_only=True,
+        clouds='kubernetes',
+        name_filter=name_filter,
+        region_filter=context,
+        quantity_filter=quantity_filter,
+        case_sensitive=False)
+    assert (set(counts.keys()) == set(capacity.keys()) == set(
+        available.keys())), (f'Keys of counts ({list(counts.keys())}), '
+                             f'capacity ({list(capacity.keys())}), '
+                             f'and available ({list(available.keys())}) '
+                             'must be same.')
+    if len(counts) == 0:
+        err_msg = 'No GPUs found in Kubernetes cluster. '
+        debug_msg = 'To further debug, run: sky check '
+        if name_filter is not None:
+            gpu_info_msg = f' {name_filter!r}'
+            if quantity_filter is not None:
+                gpu_info_msg += (' with requested quantity'
+                                 f' {quantity_filter}')
+            err_msg = (f'Resources{gpu_info_msg} not found '
+                       'in Kubernetes cluster. ')
+            debug_msg = ('To show available accelerators on kubernetes,'
+                         ' run: sky show-gpus --cloud kubernetes ')
+        full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
+                        debug_msg)
+        raise ValueError(full_err_msg)
+
+    realtime_gpu_availability_list: List[models.RealtimeGpuAvailability] = []
+
+    for gpu, _ in sorted(counts.items()):
+        realtime_gpu_availability_list.append(
+            models.RealtimeGpuAvailability(
+                gpu,
+                counts.pop(gpu),
+                capacity[gpu],
+                available[gpu],
+            ))
+    return realtime_gpu_availability_list
+
+
+# =================
+# = Local Cluster =
+# =================
+@usage_lib.entrypoint
+def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
+             ssh_key: Optional[str], cleanup: bool) -> None:
+    """Creates a local or remote cluster."""
+
+    def _validate_args(ips, ssh_user, ssh_key, cleanup):
+        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
+        # all must be specified
+        if bool(ips) or bool(ssh_user) or bool(ssh_key):
+            if not (ips and ssh_user and ssh_key):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'All ips, ssh_user, and ssh_key must be specified '
+                        'together.')
+
+        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
+        # are all provided
+        if cleanup and not (ips and ssh_user and ssh_key):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'cleanup can only be used with ips, ssh_user and ssh_key.')
+
+    _validate_args(ips, ssh_user, ssh_key, cleanup)
+
+    # If remote deployment arguments are specified, run remote up script
+    if ips:
+        assert ssh_user is not None and ssh_key is not None
+        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
+                                                      cleanup)
+    else:
+        # Run local deployment (kind) if no remote args are specified
+        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+
+
+def local_down() -> None:
+    """Tears down the Kubernetes cluster started by local_up."""
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
+                                    'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = shlex.split(down_script_path)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')
+
+        if returncode == 0:
+            cluster_removed = True
+        elif returncode == 100:
+            logger.info(ux_utils.error_message('Local cluster does not exist.'))
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Failed to create local cluster. '
+                                   f'Stdout: {stdout}'
+                                   f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        logger.info(
+            ux_utils.finishing_message('Local cluster removed.',
                                        log_path=log_path,
                                        is_local=True))
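Note: taken together, the sky/core.py changes replace username-based job filtering with user hashes and add multi-user variants of `status`/`queue`/`cancel`. A hedged usage sketch of the new signatures (the cluster name is hypothetical; record keys other than `user_name`/`user_hash`/`resources_str` are not shown in this diff):

```python
import sky.core as core

# List every user's clusters; `refresh` now takes a StatusRefreshMode enum
# and defaults to common.StatusRefreshMode.NONE (no status refresh).
for record in core.status(all_users=True):
    print(record.get('user_name'), record.get('resources_str'))

core.cancel('my-cluster', all=True)        # cancel only your jobs
core.cancel('my-cluster', all_users=True)  # cancel every user's jobs
```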
sky/dag.py CHANGED
@@ -76,6 +76,10 @@ class Dag:
 
         return out_degree_condition and in_degree_condition
 
+    def validate(self, workdir_only: bool = False):
+        for task in self.tasks:
+            task.validate(workdir_only=workdir_only)
+
 
 class _DagContext(threading.local):
     """A thread-local stack of Dags."""
sky/data/mounting_utils.py CHANGED
@@ -117,7 +117,8 @@ def get_az_mount_cmd(container_name: str,
     if storage_account_key is None:
         key_env_var = f'AZURE_STORAGE_SAS_TOKEN={shlex.quote(" ")}'
     else:
-        key_env_var = f'AZURE_STORAGE_ACCESS_KEY={storage_account_key}'
+        key_env_var = ('AZURE_STORAGE_ACCESS_KEY='
+                       f'{shlex.quote(storage_account_key)}')
 
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,
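Note: the `get_az_mount_cmd` change matters because the storage account key is interpolated into a shell command; without `shlex.quote`, a key containing shell metacharacters could break, or inject into, the generated mount command. A self-contained illustration:

```python
import shlex

key = 'abc$(rm -rf ~)def'  # hostile example value
unsafe = f'AZURE_STORAGE_ACCESS_KEY={key}'             # old: shell-expanded
safe = f'AZURE_STORAGE_ACCESS_KEY={shlex.quote(key)}'  # new: inert literal
print(safe)  # AZURE_STORAGE_ACCESS_KEY='abc$(rm -rf ~)def'
```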