skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +26 -11
  3. sky/backends/cloud_vm_ray_backend.py +16 -5
  4. sky/client/cli/command.py +222 -4
  5. sky/client/sdk.py +110 -82
  6. sky/clouds/aws.py +10 -7
  7. sky/clouds/azure.py +10 -7
  8. sky/clouds/cloud.py +2 -0
  9. sky/clouds/cudo.py +2 -0
  10. sky/clouds/do.py +10 -7
  11. sky/clouds/fluidstack.py +2 -0
  12. sky/clouds/gcp.py +10 -7
  13. sky/clouds/hyperbolic.py +10 -7
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +26 -9
  16. sky/clouds/lambda_cloud.py +10 -7
  17. sky/clouds/nebius.py +10 -7
  18. sky/clouds/oci.py +10 -7
  19. sky/clouds/paperspace.py +10 -7
  20. sky/clouds/runpod.py +10 -7
  21. sky/clouds/scp.py +10 -7
  22. sky/clouds/vast.py +10 -7
  23. sky/clouds/vsphere.py +2 -0
  24. sky/core.py +1 -0
  25. sky/dag.py +14 -0
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  30. sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  32. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  37. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  38. sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  54. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  55. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  56. sky/dashboard/out/clusters/[cluster].html +1 -1
  57. sky/dashboard/out/clusters.html +1 -1
  58. sky/dashboard/out/config.html +1 -1
  59. sky/dashboard/out/index.html +1 -1
  60. sky/dashboard/out/infra/[context].html +1 -1
  61. sky/dashboard/out/infra.html +1 -1
  62. sky/dashboard/out/jobs/[job].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -0
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage_utils.py +2 -4
  70. sky/exceptions.py +15 -0
  71. sky/execution.py +5 -0
  72. sky/global_user_state.py +129 -0
  73. sky/jobs/client/sdk.py +13 -11
  74. sky/jobs/server/core.py +4 -0
  75. sky/models.py +16 -0
  76. sky/provision/__init__.py +26 -0
  77. sky/provision/kubernetes/__init__.py +3 -0
  78. sky/provision/kubernetes/instance.py +38 -77
  79. sky/provision/kubernetes/utils.py +52 -2
  80. sky/provision/kubernetes/volume.py +147 -0
  81. sky/resources.py +20 -76
  82. sky/serve/client/sdk.py +13 -13
  83. sky/serve/server/core.py +5 -1
  84. sky/server/common.py +40 -5
  85. sky/server/constants.py +5 -1
  86. sky/server/metrics.py +105 -0
  87. sky/server/requests/executor.py +30 -14
  88. sky/server/requests/payloads.py +16 -0
  89. sky/server/requests/requests.py +35 -1
  90. sky/server/rest.py +152 -0
  91. sky/server/server.py +66 -16
  92. sky/server/state.py +20 -0
  93. sky/server/stream_utils.py +8 -3
  94. sky/server/uvicorn.py +153 -13
  95. sky/setup_files/dependencies.py +2 -0
  96. sky/skylet/constants.py +14 -3
  97. sky/task.py +141 -18
  98. sky/templates/kubernetes-ray.yml.j2 +30 -1
  99. sky/users/permission.py +2 -0
  100. sky/utils/context.py +3 -1
  101. sky/utils/resources_utils.py +66 -0
  102. sky/utils/rich_utils.py +6 -0
  103. sky/utils/schemas.py +146 -3
  104. sky/utils/status_lib.py +10 -0
  105. sky/utils/validator.py +11 -1
  106. sky/volumes/__init__.py +0 -0
  107. sky/volumes/client/__init__.py +0 -0
  108. sky/volumes/client/sdk.py +64 -0
  109. sky/volumes/server/__init__.py +0 -0
  110. sky/volumes/server/core.py +199 -0
  111. sky/volumes/server/server.py +85 -0
  112. sky/volumes/utils.py +158 -0
  113. sky/volumes/volume.py +198 -0
  114. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  115. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
  116. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  124. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  125. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  126. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  131. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  136. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  137. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  138. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  139. /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
  140. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  141. /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
  142. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  143. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  144. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  145. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,7 @@ import shutil
10
10
  import subprocess
11
11
  import time
12
12
  import typing
13
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
13
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
14
14
  from urllib.parse import urlparse
15
15
 
16
16
  import sky
@@ -2734,6 +2734,21 @@ def get_kubernetes_node_info(
2734
2734
  node.metadata.labels.get(label_key))
2735
2735
  break
2736
2736
 
2737
+ # Extract IP address from node addresses (prefer external, fallback to internal)
2738
+ node_ip = None
2739
+ if node.status.addresses:
2740
+ # First try to find external IP
2741
+ for address in node.status.addresses:
2742
+ if address.type == 'ExternalIP':
2743
+ node_ip = address.address
2744
+ break
2745
+ # If no external IP, try to find internal IP
2746
+ if node_ip is None:
2747
+ for address in node.status.addresses:
2748
+ if address.type == 'InternalIP':
2749
+ node_ip = address.address
2750
+ break
2751
+
2737
2752
  allocated_qty = 0
2738
2753
  accelerator_count = get_node_accelerator_count(node.status.allocatable)
2739
2754
 
@@ -2765,7 +2780,8 @@ def get_kubernetes_node_info(
2765
2780
  name=node.metadata.name,
2766
2781
  accelerator_type=accelerator_name,
2767
2782
  total={'accelerator_count': int(accelerator_count)},
2768
- free={'accelerators_available': int(accelerators_available)})
2783
+ free={'accelerators_available': int(accelerators_available)},
2784
+ ip_address=node_ip)
2769
2785
  hint = ''
2770
2786
  if has_multi_host_tpu:
2771
2787
  hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3281,3 +3297,37 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
3281
3297
 
3282
3298
  format_kubeconfig_exec_auth(config, path)
3283
3299
  return path
3300
+
3301
+
3302
+ def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
3303
+ resource_name: str) -> None:
3304
+ """Helper to delete Kubernetes resources with 404 handling and retries.
3305
+
3306
+ Args:
3307
+ delete_func: Function to call to delete the resource
3308
+ resource_type: Type of resource being deleted (e.g. 'service'),
3309
+ used in logging
3310
+ resource_name: Name of the resource being deleted, used in logging
3311
+ """
3312
+ max_retries = 3
3313
+ retry_delay = 5 # seconds
3314
+
3315
+ for attempt in range(max_retries):
3316
+ try:
3317
+ delete_func()
3318
+ return
3319
+ except kubernetes.api_exception() as e:
3320
+ if e.status == 404:
3321
+ logger.warning(
3322
+ f'terminate_instances: Tried to delete {resource_type} '
3323
+ f'{resource_name}, but the {resource_type} was not '
3324
+ 'found (404).')
3325
+ return
3326
+ elif attempt < max_retries - 1:
3327
+ logger.warning(f'terminate_instances: Failed to delete '
3328
+ f'{resource_type} {resource_name} (attempt '
3329
+ f'{attempt + 1}/{max_retries}). Error: {e}. '
3330
+ f'Retrying in {retry_delay} seconds...')
3331
+ time.sleep(retry_delay)
3332
+ else:
3333
+ raise
@@ -0,0 +1,147 @@
1
+ """Kubernetes pvc provisioning."""
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ from sky import models
5
+ from sky import sky_logging
6
+ from sky.adaptors import kubernetes
7
+ from sky.provision.kubernetes import config as config_lib
8
+ from sky.provision.kubernetes import utils as kubernetes_utils
9
+ from sky.volumes import volume as volume_lib
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
15
+ """Gets the context and namespace of a volume."""
16
+ if config.region is None:
17
+ context = kubernetes_utils.get_current_kube_config_context_name()
18
+ config.region = context
19
+ else:
20
+ context = config.region
21
+ namespace = config.config.get('namespace')
22
+ if namespace is None:
23
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
24
+ config.config['namespace'] = namespace
25
+ return context, namespace
26
+
27
+
28
+ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
29
+ pod_spec: Dict[str, Any]) -> None:
30
+ """Checks if the PVC is used by any pod in the namespace."""
31
+ volumes = pod_spec.get('spec', {}).get('volumes', [])
32
+ if not volumes:
33
+ return
34
+ once_modes = [
35
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
36
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value
37
+ ]
38
+ for volume in volumes:
39
+ pvc_name = volume.get('persistentVolumeClaim', {}).get('claimName')
40
+ if not pvc_name:
41
+ continue
42
+ pvc = kubernetes.core_api(
43
+ context).read_namespaced_persistent_volume_claim(
44
+ name=pvc_name, namespace=namespace)
45
+ access_mode = pvc.spec.access_modes[0]
46
+ if access_mode not in once_modes:
47
+ continue
48
+ usedby = _get_volume_usedby(context, namespace, pvc_name)
49
+ if usedby:
50
+ raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
51
+ f'mode {access_mode} is already '
52
+ f'in use by {usedby}.')
53
+
54
+
55
+ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
56
+ """Creates or registers a volume."""
57
+ context, namespace = _get_context_namespace(config)
58
+ pvc_spec = _get_pvc_spec(namespace, config)
59
+ create_persistent_volume_claim(namespace, context, pvc_spec)
60
+ return config
61
+
62
+
63
+ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
64
+ """Deletes a volume."""
65
+ context, namespace = _get_context_namespace(config)
66
+ pvc_name = config.name_on_cloud
67
+ logger.info(f'Deleting PVC {pvc_name}')
68
+ kubernetes_utils.delete_k8s_resource_with_retry(
69
+ delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
70
+ context).delete_namespaced_persistent_volume_claim(
71
+ name=pvc_name,
72
+ namespace=namespace,
73
+ _request_timeout=config_lib.DELETION_TIMEOUT),
74
+ resource_type='pvc',
75
+ resource_name=pvc_name)
76
+ return config
77
+
78
+
79
+ def _get_volume_usedby(context: Optional[str], namespace: str,
80
+ pvc_name: str) -> List[str]:
81
+ """Gets the usedby resources of a volume."""
82
+ usedby = []
83
+ # Get all pods in the namespace
84
+ pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
85
+ for pod in pods.items:
86
+ if pod.spec.volumes is not None:
87
+ for volume in pod.spec.volumes:
88
+ if volume.persistent_volume_claim is not None:
89
+ if volume.persistent_volume_claim.claim_name == pvc_name:
90
+ usedby.append(pod.metadata.name)
91
+ return usedby
92
+
93
+
94
+ def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
95
+ """Gets the usedby resources of a volume."""
96
+ context, namespace = _get_context_namespace(config)
97
+ pvc_name = config.name_on_cloud
98
+ return _get_volume_usedby(context, namespace, pvc_name)
99
+
100
+
101
+ def create_persistent_volume_claim(namespace: str, context: Optional[str],
102
+ pvc_spec: Dict[str, Any]) -> None:
103
+ """Creates a persistent volume claim for SkyServe controller."""
104
+ pvc_name = pvc_spec['metadata']['name']
105
+ try:
106
+ kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
107
+ name=pvc_name, namespace=namespace)
108
+ logger.debug(f'PVC {pvc_name} already exists')
109
+ return
110
+ except kubernetes.api_exception() as e:
111
+ if e.status != 404: # Not found
112
+ raise
113
+ logger.info(f'Creating PVC {pvc_name}')
114
+ kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
115
+ namespace=namespace, body=pvc_spec)
116
+
117
+
118
+ def _get_pvc_spec(namespace: str,
119
+ config: models.VolumeConfig) -> Dict[str, Any]:
120
+ """Gets the PVC spec for the given storage config."""
121
+ access_mode = config.config.get('access_mode')
122
+ size = config.size
123
+ # The previous code assumes that the access_mode and size are always set.
124
+ assert access_mode is not None
125
+ assert size is not None
126
+ pvc_spec: Dict[str, Any] = {
127
+ 'metadata': {
128
+ 'name': config.name_on_cloud,
129
+ 'namespace': namespace,
130
+ 'labels': {
131
+ 'parent': 'skypilot',
132
+ 'skypilot-name': config.name,
133
+ }
134
+ },
135
+ 'spec': {
136
+ 'accessModes': [access_mode],
137
+ 'resources': {
138
+ 'requests': {
139
+ 'storage': f'{size}Gi'
140
+ }
141
+ },
142
+ }
143
+ }
144
+ storage_class = config.config.get('storage_class_name')
145
+ if storage_class is not None:
146
+ pvc_spec['spec']['storageClassName'] = storage_class
147
+ return pvc_spec
sky/resources.py CHANGED
@@ -30,6 +30,9 @@ from sky.utils import resources_utils
30
30
  from sky.utils import schemas
31
31
  from sky.utils import ux_utils
32
32
 
33
+ if typing.TYPE_CHECKING:
34
+ from sky.volumes import volume as volume_lib
35
+
33
36
  logger = sky_logging.init_logger(__name__)
34
37
 
35
38
  _DEFAULT_DISK_SIZE_GB = 256
@@ -289,7 +292,8 @@ class Resources:
289
292
  self._job_recovery = job_recovery
290
293
 
291
294
  if disk_size is not None:
292
- self._disk_size = int(parse_memory_resource(disk_size, 'disk_size'))
295
+ self._disk_size = int(
296
+ resources_utils.parse_memory_resource(disk_size, 'disk_size'))
293
297
  else:
294
298
  self._disk_size = _DEFAULT_DISK_SIZE_GB
295
299
 
@@ -707,11 +711,11 @@ class Resources:
707
711
  self._memory = None
708
712
  return
709
713
 
710
- memory = parse_memory_resource(str(memory),
711
- 'memory',
712
- ret_type=float,
713
- allow_plus=True,
714
- allow_x=True)
714
+ memory = resources_utils.parse_memory_resource(str(memory),
715
+ 'memory',
716
+ ret_type=float,
717
+ allow_plus=True,
718
+ allow_x=True)
715
719
  self._memory = memory
716
720
  if memory.endswith(('+', 'x')):
717
721
  # 'x' is used internally for make sure our resources used by
@@ -1465,11 +1469,15 @@ class Resources:
1465
1469
  def get_spot_str(self) -> str:
1466
1470
  return '[Spot]' if self.use_spot else ''
1467
1471
 
1468
- def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
1469
- region: clouds.Region,
1470
- zones: Optional[List[clouds.Zone]],
1471
- num_nodes: int,
1472
- dryrun: bool) -> Dict[str, Optional[str]]:
1472
+ def make_deploy_variables(
1473
+ self,
1474
+ cluster_name: resources_utils.ClusterName,
1475
+ region: clouds.Region,
1476
+ zones: Optional[List[clouds.Zone]],
1477
+ num_nodes: int,
1478
+ dryrun: bool,
1479
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
1480
+ ) -> Dict[str, Optional[str]]:
1473
1481
  """Converts planned sky.Resources to resource variables.
1474
1482
 
1475
1483
  These variables are divided into two categories: cloud-specific and
@@ -1491,7 +1499,7 @@ class Resources:
1491
1499
  # Cloud specific variables
1492
1500
  assert self.cloud is not None, 'Cloud must be specified'
1493
1501
  cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1494
- self, cluster_name, region, zones, num_nodes, dryrun)
1502
+ self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
1495
1503
 
1496
1504
  # TODO(andyl): Should we print some warnings if users' envs share
1497
1505
  # same names with the cloud specific variables, but not enabled
@@ -2291,67 +2299,3 @@ def parse_time_minutes(time: str) -> int:
2291
2299
  continue
2292
2300
 
2293
2301
  raise ValueError(f'Invalid time format: {time}')
2294
-
2295
-
2296
- def parse_memory_resource(resource_qty_str: Union[str, int, float],
2297
- field_name: str,
2298
- ret_type: type = int,
2299
- unit: str = 'gb',
2300
- allow_plus: bool = False,
2301
- allow_x: bool = False,
2302
- allow_rounding: bool = False) -> str:
2303
- """Returns memory size in chosen units given a resource quantity string.
2304
-
2305
- Args:
2306
- resource_qty_str: Resource quantity string
2307
- unit: Unit to convert to
2308
- allow_plus: Whether to allow '+' prefix
2309
- allow_x: Whether to allow 'x' suffix
2310
- """
2311
- assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
2312
-
2313
- error_msg = f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'\
2314
- f' got {resource_qty_str}'
2315
-
2316
- resource_str = str(resource_qty_str)
2317
-
2318
- # Handle plus and x suffixes, x is only used internally for jobs controller
2319
- plus = ''
2320
- if resource_str.endswith('+'):
2321
- if allow_plus:
2322
- resource_str = resource_str[:-1]
2323
- plus = '+'
2324
- else:
2325
- raise ValueError(error_msg)
2326
-
2327
- x = ''
2328
- if resource_str.endswith('x'):
2329
- if allow_x:
2330
- resource_str = resource_str[:-1]
2331
- x = 'x'
2332
- else:
2333
- raise ValueError(error_msg)
2334
-
2335
- try:
2336
- # We assume it is already in the wanted units to maintain backwards
2337
- # compatibility
2338
- ret_type(resource_str)
2339
- return f'{resource_str}{plus}{x}'
2340
- except ValueError:
2341
- pass
2342
-
2343
- resource_str = resource_str.lower()
2344
- for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
2345
- if resource_str.endswith(mem_unit):
2346
- try:
2347
- value = ret_type(resource_str[:-len(mem_unit)])
2348
- converted = (value * multiplier /
2349
- constants.MEMORY_SIZE_UNITS[unit])
2350
- if not allow_rounding and ret_type(converted) != converted:
2351
- raise ValueError(error_msg)
2352
- converted = ret_type(converted)
2353
- return f'{converted}{plus}{x}'
2354
- except ValueError:
2355
- continue
2356
-
2357
- raise ValueError(error_msg)
sky/serve/client/sdk.py CHANGED
@@ -5,9 +5,9 @@ from typing import List, Optional, Union
5
5
 
6
6
  import click
7
7
 
8
- from sky.adaptors import common as adaptors_common
9
8
  from sky.client import common as client_common
10
9
  from sky.server import common as server_common
10
+ from sky.server import rest
11
11
  from sky.server.requests import payloads
12
12
  from sky.usage import usage_lib
13
13
  from sky.utils import admin_policy_utils
@@ -17,12 +17,8 @@ from sky.utils import dag_utils
17
17
  if typing.TYPE_CHECKING:
18
18
  import io
19
19
 
20
- import requests
21
-
22
20
  import sky
23
21
  from sky.serve import serve_utils
24
- else:
25
- requests = adaptors_common.LazyImport('requests')
26
22
 
27
23
 
28
24
  @context.contextual
@@ -78,7 +74,7 @@ def up(
78
74
  task=dag_str,
79
75
  service_name=service_name,
80
76
  )
81
- response = requests.post(
77
+ response = rest.post(
82
78
  f'{server_common.get_server_url()}/serve/up',
83
79
  json=json.loads(body.model_dump_json()),
84
80
  timeout=(5, None),
@@ -140,7 +136,7 @@ def update(
140
136
  mode=mode,
141
137
  )
142
138
 
143
- response = requests.post(
139
+ response = rest.post(
144
140
  f'{server_common.get_server_url()}/serve/update',
145
141
  json=json.loads(body.model_dump_json()),
146
142
  timeout=(5, None),
@@ -182,7 +178,7 @@ def down(
182
178
  all=all,
183
179
  purge=purge,
184
180
  )
185
- response = requests.post(
181
+ response = rest.post(
186
182
  f'{server_common.get_server_url()}/serve/down',
187
183
  json=json.loads(body.model_dump_json()),
188
184
  timeout=(5, None),
@@ -217,7 +213,7 @@ def terminate_replica(service_name: str, replica_id: int,
217
213
  replica_id=replica_id,
218
214
  purge=purge,
219
215
  )
220
- response = requests.post(
216
+ response = rest.post(
221
217
  f'{server_common.get_server_url()}/serve/terminate-replica',
222
218
  json=json.loads(body.model_dump_json()),
223
219
  timeout=(5, None),
@@ -290,7 +286,7 @@ def status(
290
286
  exceptions.ClusterNotUpError: if the sky serve controller is not up.
291
287
  """
292
288
  body = payloads.ServeStatusBody(service_names=service_names,)
293
- response = requests.post(
289
+ response = rest.post(
294
290
  f'{server_common.get_server_url()}/serve/status',
295
291
  json=json.loads(body.model_dump_json()),
296
292
  timeout=(5, None),
@@ -301,6 +297,7 @@ def status(
301
297
 
302
298
  @usage_lib.entrypoint
303
299
  @server_common.check_server_healthy_or_start
300
+ @rest.retry_on_server_unavailable()
304
301
  def tail_logs(service_name: str,
305
302
  target: Union[str, 'serve_utils.ServiceComponent'],
306
303
  replica_id: Optional[int] = None,
@@ -376,7 +373,7 @@ def tail_logs(service_name: str,
376
373
  replica_id=replica_id,
377
374
  follow=follow,
378
375
  )
379
- response = requests.post(
376
+ response = rest.post(
380
377
  f'{server_common.get_server_url()}/serve/logs',
381
378
  json=json.loads(body.model_dump_json()),
382
379
  timeout=(5, None),
@@ -384,7 +381,10 @@ def tail_logs(service_name: str,
384
381
  cookies=server_common.get_api_cookie_jar(),
385
382
  )
386
383
  request_id = server_common.get_request_id(response)
387
- sdk.stream_response(request_id, response, output_stream)
384
+ return sdk.stream_response(request_id=request_id,
385
+ response=response,
386
+ output_stream=output_stream,
387
+ resumable=True)
388
388
 
389
389
 
390
390
  @usage_lib.entrypoint
@@ -436,7 +436,7 @@ def sync_down_logs(service_name: str,
436
436
  targets=targets,
437
437
  replica_ids=replica_ids,
438
438
  )
439
- response = requests.post(
439
+ response = rest.post(
440
440
  f'{server_common.get_server_url()}/serve/sync-down-logs',
441
441
  json=json.loads(body.model_dump_json()),
442
442
  timeout=(5, None),
sky/serve/server/core.py CHANGED
@@ -28,6 +28,7 @@ from sky.utils import command_runner
28
28
  from sky.utils import common
29
29
  from sky.utils import common_utils
30
30
  from sky.utils import controller_utils
31
+ from sky.utils import dag_utils
31
32
  from sky.utils import rich_utils
32
33
  from sky.utils import subprocess_utils
33
34
  from sky.utils import ux_utils
@@ -139,10 +140,13 @@ def up(
139
140
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
140
141
 
141
142
  serve_utils.validate_service_task(task)
143
+ dag = dag_utils.convert_entrypoint_to_dag(task)
144
+ dag.resolve_and_validate_volumes()
142
145
  # Always apply the policy again here, even though it might have been applied
143
146
  # in the CLI. This is to ensure that we apply the policy to the final DAG
144
147
  # and get the mutated config.
145
- dag, mutated_user_config = admin_policy_utils.apply(task)
148
+ dag, mutated_user_config = admin_policy_utils.apply(dag)
149
+ dag.pre_mount_volumes()
146
150
  task = dag.tasks[0]
147
151
 
148
152
  with rich_utils.safe_status(
sky/server/common.py CHANGED
@@ -9,11 +9,13 @@ import json
9
9
  import os
10
10
  import pathlib
11
11
  import re
12
+ import shutil
12
13
  import subprocess
13
14
  import sys
15
+ import tempfile
14
16
  import time
15
17
  import typing
16
- from typing import Any, Dict, Literal, Optional, Tuple
18
+ from typing import Any, Dict, Literal, Optional, Tuple, Union
17
19
  from urllib import parse
18
20
  import uuid
19
21
 
@@ -27,6 +29,7 @@ from sky import skypilot_config
27
29
  from sky.adaptors import common as adaptors_common
28
30
  from sky.data import data_utils
29
31
  from sky.server import constants as server_constants
32
+ from sky.server import rest
30
33
  from sky.skylet import constants
31
34
  from sky.usage import usage_lib
32
35
  from sky.utils import annotations
@@ -240,9 +243,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
240
243
  server_url = endpoint if endpoint is not None else get_server_url()
241
244
  while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
242
245
  try:
243
- response = requests.get(f'{server_url}/api/health',
244
- timeout=2.5,
245
- cookies=get_api_cookie_jar())
246
+ response = rest.get(f'{server_url}/api/health',
247
+ timeout=2.5,
248
+ cookies=get_api_cookie_jar())
246
249
  except requests.exceptions.Timeout:
247
250
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
248
251
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
@@ -327,6 +330,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
327
330
  def _start_api_server(deploy: bool = False,
328
331
  host: str = '127.0.0.1',
329
332
  foreground: bool = False,
333
+ metrics: bool = False,
334
+ metrics_port: Optional[int] = None,
330
335
  enable_basic_auth: bool = False):
331
336
  """Starts a SkyPilot API server locally."""
332
337
  server_url = get_server_url(host)
@@ -357,10 +362,13 @@ def _start_api_server(deploy: bool = False,
357
362
  args += ['--deploy']
358
363
  if host is not None:
359
364
  args += [f'--host={host}']
365
+ if metrics_port is not None:
366
+ args += [f'--metrics-port={metrics_port}']
360
367
 
361
368
  if foreground:
362
369
  # Replaces the current process with the API server
363
370
  os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
371
+ _set_metrics_env_var(os.environ, metrics, deploy)
364
372
  if enable_basic_auth:
365
373
  os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
366
374
  os.execvp(args[0], args)
@@ -368,6 +376,10 @@ def _start_api_server(deploy: bool = False,
368
376
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
369
377
  os.makedirs(os.path.dirname(log_path), exist_ok=True)
370
378
 
379
+ # For spawn mode, copy the environ to avoid polluting the SDK process.
380
+ server_env = os.environ.copy()
381
+ server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
382
+ _set_metrics_env_var(server_env, metrics, deploy)
371
383
  # Start the API server process in the background and don't wait for it.
372
384
  # If this is called from a CLI invocation, we need
373
385
  # start_new_session=True so that SIGINT on the CLI will not also kill
@@ -437,6 +449,26 @@ def _start_api_server(deploy: bool = False,
437
449
  f'SkyPilot API server started. {dashboard_msg}'))
438
450
 
439
451
 
452
+ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
453
+ deploy: bool):
454
+ """Sets the metrics environment variables.
455
+
456
+ Args:
457
+ env: The environment variables to set.
458
+ metrics: Whether to enable metrics.
459
+ deploy: Whether the server is running in deploy mode, which means
460
+ multiple processes might be running.
461
+ """
462
+ if metrics:
463
+ env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
464
+ if deploy:
465
+ metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
466
+ shutil.rmtree(metrics_dir, ignore_errors=True)
467
+ os.makedirs(metrics_dir, exist_ok=True)
468
+ # Refer to https://prometheus.github.io/client_python/multiprocess/
469
+ env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
470
+
471
+
440
472
  def check_server_healthy(
441
473
  endpoint: Optional[str] = None
442
474
  ) -> Tuple[Literal[
@@ -571,6 +603,8 @@ def get_skypilot_version_on_disk() -> str:
571
603
  def check_server_healthy_or_start_fn(deploy: bool = False,
572
604
  host: str = '127.0.0.1',
573
605
  foreground: bool = False,
606
+ metrics: bool = False,
607
+ metrics_port: Optional[int] = None,
574
608
  enable_basic_auth: bool = False):
575
609
  api_server_status = None
576
610
  try:
@@ -592,7 +626,8 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
592
626
  # have started the server while we were waiting for the lock.
593
627
  api_server_info = get_api_server_status(endpoint)
594
628
  if api_server_info.status == ApiServerStatus.UNHEALTHY:
595
- _start_api_server(deploy, host, foreground, enable_basic_auth)
629
+ _start_api_server(deploy, host, foreground, metrics,
630
+ metrics_port, enable_basic_auth)
596
631
 
597
632
 
598
633
  def check_server_healthy_or_start(func):
sky/server/constants.py CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
7
7
  # API server version, whenever there is a change in API server that requires a
8
8
  # restart of the local API server or error out when the client does not match
9
9
  # the server version.
10
- API_VERSION = '9'
10
+ API_VERSION = '10'
11
11
 
12
12
  # Prefix for API request names.
13
13
  REQUEST_NAME_PREFIX = 'sky.'
@@ -22,6 +22,10 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
22
22
  # background.
23
23
  CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
24
24
 
25
+ # The interval (seconds) for the volume status to be refreshed in the
26
+ # background.
27
+ VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
28
+
25
29
  # Environment variable for a file path to the API cookie file.
26
30
  # Keep in sync with websocket_proxy.py
27
31
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'