skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/clouds/do.py CHANGED
@@ -14,6 +14,7 @@ from sky.utils import resources_utils
14
14
 
15
15
  if typing.TYPE_CHECKING:
16
16
  from sky import resources as resources_lib
17
+ from sky.volumes import volume as volume_lib
17
18
 
18
19
  _CREDENTIAL_FILE = 'config.yaml'
19
20
 
@@ -175,13 +176,15 @@ class DO(clouds.Cloud):
175
176
  return None
176
177
 
177
178
  def make_deploy_resources_variables(
178
- self,
179
- resources: 'resources_lib.Resources',
180
- cluster_name: resources_utils.ClusterName,
181
- region: 'clouds.Region',
182
- zones: Optional[List['clouds.Zone']],
183
- num_nodes: int,
184
- dryrun: bool = False) -> Dict[str, Optional[str]]:
179
+ self,
180
+ resources: 'resources_lib.Resources',
181
+ cluster_name: resources_utils.ClusterName,
182
+ region: 'clouds.Region',
183
+ zones: Optional[List['clouds.Zone']],
184
+ num_nodes: int,
185
+ dryrun: bool = False,
186
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
187
+ ) -> Dict[str, Optional[str]]:
185
188
  del zones, dryrun, cluster_name
186
189
 
187
190
  resources = resources.assert_launchable()
sky/clouds/fluidstack.py CHANGED
@@ -21,6 +21,7 @@ if typing.TYPE_CHECKING:
21
21
 
22
22
  # Renaming to avoid shadowing variables.
23
23
  from sky import resources as resources_lib
24
+ from sky.volumes import volume as volume_lib
24
25
  else:
25
26
  requests = adaptors_common.LazyImport('requests')
26
27
 
@@ -188,6 +189,7 @@ class Fluidstack(clouds.Cloud):
188
189
  zones: Optional[List[clouds.Zone]],
189
190
  num_nodes: int,
190
191
  dryrun: bool = False,
192
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
191
193
  ) -> Dict[str, Optional[str]]:
192
194
 
193
195
  assert zones is None, 'FluidStack does not support zones.'
sky/clouds/gcp.py CHANGED
@@ -29,6 +29,7 @@ from sky.utils import ux_utils
29
29
  if typing.TYPE_CHECKING:
30
30
  from sky import resources
31
31
  from sky.utils import status_lib
32
+ from sky.volumes import volume as volume_lib
32
33
 
33
34
  logger = sky_logging.init_logger(__name__)
34
35
 
@@ -465,13 +466,15 @@ class GCP(clouds.Cloud):
465
466
  assert False, 'Low disk tier should always be supported on GCP.'
466
467
 
467
468
  def make_deploy_resources_variables(
468
- self,
469
- resources: 'resources.Resources',
470
- cluster_name: resources_utils.ClusterName,
471
- region: 'clouds.Region',
472
- zones: Optional[List['clouds.Zone']],
473
- num_nodes: int,
474
- dryrun: bool = False) -> Dict[str, Optional[str]]:
469
+ self,
470
+ resources: 'resources.Resources',
471
+ cluster_name: resources_utils.ClusterName,
472
+ region: 'clouds.Region',
473
+ zones: Optional[List['clouds.Zone']],
474
+ num_nodes: int,
475
+ dryrun: bool = False,
476
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
477
+ ) -> Dict[str, Optional[str]]:
475
478
  assert zones is not None, (region, zones)
476
479
 
477
480
  region_name = region.name
sky/clouds/hyperbolic.py CHANGED
@@ -13,6 +13,7 @@ from sky.utils.resources_utils import DiskTier
13
13
 
14
14
  if typing.TYPE_CHECKING:
15
15
  from sky import resources as resources_lib
16
+ from sky.volumes import volume as volume_lib
16
17
 
17
18
 
18
19
  @registry.CLOUD_REGISTRY.register
@@ -244,13 +245,15 @@ class Hyperbolic(clouds.Cloud):
244
245
  return 0.0
245
246
 
246
247
  def make_deploy_resources_variables(
247
- self,
248
- resources: 'resources_lib.Resources',
249
- cluster_name: resources_utils.ClusterName,
250
- region: 'clouds.Region',
251
- zones: Optional[List['clouds.Zone']],
252
- num_nodes: int,
253
- dryrun: bool = False) -> Dict[str, Any]:
248
+ self,
249
+ resources: 'resources_lib.Resources',
250
+ cluster_name: resources_utils.ClusterName,
251
+ region: 'clouds.Region',
252
+ zones: Optional[List['clouds.Zone']],
253
+ num_nodes: int,
254
+ dryrun: bool = False,
255
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
256
+ ) -> Dict[str, Any]:
254
257
  """Returns a dict of variables for the deployment template."""
255
258
  del dryrun, region, cluster_name # unused
256
259
  assert zones is None, ('Hyperbolic does not support zones', zones)
sky/clouds/ibm.py CHANGED
@@ -18,6 +18,7 @@ from sky.utils import ux_utils
18
18
  if typing.TYPE_CHECKING:
19
19
  # renaming to avoid shadowing variables
20
20
  from sky import resources as resources_lib
21
+ from sky.volumes import volume as volume_lib
21
22
 
22
23
  logger = sky_logging.init_logger(__name__)
23
24
 
@@ -175,6 +176,7 @@ class IBM(clouds.Cloud):
175
176
  zones: Optional[List['clouds.Zone']],
176
177
  num_nodes: int,
177
178
  dryrun: bool = False,
179
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
178
180
  ) -> Dict[str, Any]:
179
181
  """Converts planned sky.Resources to cloud-specific resource variables.
180
182
 
sky/clouds/kubernetes.py CHANGED
@@ -25,6 +25,7 @@ from sky.utils import common_utils
25
25
  from sky.utils import registry
26
26
  from sky.utils import resources_utils
27
27
  from sky.utils import schemas
28
+ from sky.volumes import volume as volume_lib
28
29
 
29
30
  if typing.TYPE_CHECKING:
30
31
  # Renaming to avoid shadowing variables.
@@ -394,7 +395,9 @@ class Kubernetes(clouds.Cloud):
394
395
  return 0
395
396
 
396
397
  @staticmethod
397
- def _calculate_provision_timeout(num_nodes: int) -> int:
398
+ def _calculate_provision_timeout(
399
+ num_nodes: int,
400
+ volume_mounts: Optional[List['volume_lib.VolumeMount']]) -> int:
398
401
  """Calculate provision timeout based on number of nodes.
399
402
 
400
403
  The timeout scales linearly with the number of nodes to account for
@@ -409,19 +412,33 @@ class Kubernetes(clouds.Cloud):
409
412
  base_timeout = 10 # Base timeout for single node
410
413
  per_node_timeout = 0.2 # Additional seconds per node
411
414
  max_timeout = 60 # Cap at 1 minute
415
+ if volume_mounts is not None:
416
+ for volume_mount in volume_mounts:
417
+ if (volume_mount.volume_config.type ==
418
+ volume_lib.VolumeType.PVC.value):
419
+ if (volume_mount.volume_config.config.get(
420
+ 'access_mode', '') ==
421
+ volume_lib.VolumeAccessMode.READ_WRITE_MANY.value):
422
+ # GKE may take several minutes to provision a PV
423
+ # supporting READ_WRITE_MANY with filestore.
424
+ base_timeout = 180
425
+ max_timeout = 240
426
+ break
412
427
 
413
428
  return int(
414
429
  min(base_timeout + (per_node_timeout * (num_nodes - 1)),
415
430
  max_timeout))
416
431
 
417
432
  def make_deploy_resources_variables(
418
- self,
419
- resources: 'resources_lib.Resources',
420
- cluster_name: 'resources_utils.ClusterName',
421
- region: Optional['clouds.Region'],
422
- zones: Optional[List['clouds.Zone']],
423
- num_nodes: int,
424
- dryrun: bool = False) -> Dict[str, Optional[str]]:
433
+ self,
434
+ resources: 'resources_lib.Resources',
435
+ cluster_name: 'resources_utils.ClusterName',
436
+ region: Optional['clouds.Region'],
437
+ zones: Optional[List['clouds.Zone']],
438
+ num_nodes: int,
439
+ dryrun: bool = False,
440
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
441
+ ) -> Dict[str, Optional[str]]:
425
442
  del cluster_name, zones, dryrun # Unused.
426
443
  if region is None:
427
444
  context = kubernetes_utils.get_current_kube_config_context_name()
@@ -562,7 +579,7 @@ class Kubernetes(clouds.Cloud):
562
579
  # We use a linear scaling formula to determine the timeout based on the
563
580
  # number of nodes.
564
581
 
565
- timeout = self._calculate_provision_timeout(num_nodes)
582
+ timeout = self._calculate_provision_timeout(num_nodes, volume_mounts)
566
583
  timeout = skypilot_config.get_nested(
567
584
  ('kubernetes', 'provision_timeout'),
568
585
  timeout,
@@ -653,6 +670,7 @@ class Kubernetes(clouds.Cloud):
653
670
  (constants.PERSISTENT_RUN_SCRIPT_DIR),
654
671
  'k8s_high_availability_restarting_signal_file':
655
672
  (constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE),
673
+ 'ha_recovery_log_path': constants.HA_PERSISTENT_RECOVERY_LOG_PATH,
656
674
  'sky_python_cmd': constants.SKY_PYTHON_CMD,
657
675
  'k8s_high_availability_storage_class_name':
658
676
  (k8s_ha_storage_class_name),
sky/clouds/lambda_cloud.py CHANGED
@@ -15,6 +15,7 @@ if typing.TYPE_CHECKING:
15
15
 
16
16
  # Renaming to avoid shadowing variables.
17
17
  from sky import resources as resources_lib
18
+ from sky.volumes import volume as volume_lib
18
19
  else:
19
20
  requests = adaptors_common.LazyImport('requests')
20
21
 
@@ -159,13 +160,15 @@ class Lambda(clouds.Cloud):
159
160
  return None
160
161
 
161
162
  def make_deploy_resources_variables(
162
- self,
163
- resources: 'resources_lib.Resources',
164
- cluster_name: 'resources_utils.ClusterName',
165
- region: 'clouds.Region',
166
- zones: Optional[List['clouds.Zone']],
167
- num_nodes: int,
168
- dryrun: bool = False) -> Dict[str, Any]:
163
+ self,
164
+ resources: 'resources_lib.Resources',
165
+ cluster_name: 'resources_utils.ClusterName',
166
+ region: 'clouds.Region',
167
+ zones: Optional[List['clouds.Zone']],
168
+ num_nodes: int,
169
+ dryrun: bool = False,
170
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
171
+ ) -> Dict[str, Any]:
169
172
  del cluster_name, dryrun # Unused.
170
173
  assert zones is None, 'Lambda does not support zones.'
171
174
  resources = resources.assert_launchable()
sky/clouds/nebius.py CHANGED
@@ -16,6 +16,7 @@ from sky.utils import resources_utils
16
16
 
17
17
  if typing.TYPE_CHECKING:
18
18
  from sky import resources as resources_lib
19
+ from sky.volumes import volume as volume_lib
19
20
 
20
21
  _INDENT_PREFIX = ' '
21
22
 
@@ -196,13 +197,15 @@ class Nebius(clouds.Cloud):
196
197
  return None
197
198
 
198
199
  def make_deploy_resources_variables(
199
- self,
200
- resources: 'resources_lib.Resources',
201
- cluster_name: resources_utils.ClusterName,
202
- region: 'clouds.Region',
203
- zones: Optional[List['clouds.Zone']],
204
- num_nodes: int,
205
- dryrun: bool = False) -> Dict[str, Any]:
200
+ self,
201
+ resources: 'resources_lib.Resources',
202
+ cluster_name: resources_utils.ClusterName,
203
+ region: 'clouds.Region',
204
+ zones: Optional[List['clouds.Zone']],
205
+ num_nodes: int,
206
+ dryrun: bool = False,
207
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
208
+ ) -> Dict[str, Any]:
206
209
  del dryrun, cluster_name
207
210
  assert zones is None, ('Nebius does not support zones', zones)
208
211
 
sky/clouds/oci.py CHANGED
@@ -40,6 +40,7 @@ from sky.utils import ux_utils
40
40
  if typing.TYPE_CHECKING:
41
41
  # Renaming to avoid shadowing variables.
42
42
  from sky import resources as resources_lib
43
+ from sky.volumes import volume as volume_lib
43
44
 
44
45
  logger = logging.getLogger(__name__)
45
46
 
@@ -207,13 +208,15 @@ class OCI(clouds.Cloud):
207
208
  return None
208
209
 
209
210
  def make_deploy_resources_variables(
210
- self,
211
- resources: 'resources_lib.Resources',
212
- cluster_name: resources_utils.ClusterName,
213
- region: Optional['clouds.Region'],
214
- zones: Optional[List['clouds.Zone']],
215
- num_nodes: int,
216
- dryrun: bool = False) -> Dict[str, Any]:
211
+ self,
212
+ resources: 'resources_lib.Resources',
213
+ cluster_name: resources_utils.ClusterName,
214
+ region: Optional['clouds.Region'],
215
+ zones: Optional[List['clouds.Zone']],
216
+ num_nodes: int,
217
+ dryrun: bool = False,
218
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
219
+ ) -> Dict[str, Any]:
217
220
  del cluster_name, dryrun # Unused.
218
221
  assert region is not None, resources
219
222
 
sky/clouds/paperspace.py CHANGED
@@ -14,6 +14,7 @@ if typing.TYPE_CHECKING:
14
14
  import requests
15
15
 
16
16
  from sky import resources as resources_lib
17
+ from sky.volumes import volume as volume_lib
17
18
  else:
18
19
  requests = adaptors_common.LazyImport('requests')
19
20
 
@@ -179,13 +180,15 @@ class Paperspace(clouds.Cloud):
179
180
  return None
180
181
 
181
182
  def make_deploy_resources_variables(
182
- self,
183
- resources: 'resources_lib.Resources',
184
- cluster_name: resources_utils.ClusterName,
185
- region: 'clouds.Region',
186
- zones: Optional[List['clouds.Zone']],
187
- num_nodes: int,
188
- dryrun: bool = False) -> Dict[str, Optional[str]]:
183
+ self,
184
+ resources: 'resources_lib.Resources',
185
+ cluster_name: resources_utils.ClusterName,
186
+ region: 'clouds.Region',
187
+ zones: Optional[List['clouds.Zone']],
188
+ num_nodes: int,
189
+ dryrun: bool = False,
190
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
191
+ ) -> Dict[str, Optional[str]]:
189
192
  del zones, dryrun, cluster_name
190
193
 
191
194
  resources = resources.assert_launchable()
sky/clouds/runpod.py CHANGED
@@ -10,6 +10,7 @@ from sky.utils import resources_utils
10
10
 
11
11
  if typing.TYPE_CHECKING:
12
12
  from sky import resources as resources_lib
13
+ from sky.volumes import volume as volume_lib
13
14
 
14
15
  _CREDENTIAL_FILES = [
15
16
  'config.toml',
@@ -160,13 +161,15 @@ class RunPod(clouds.Cloud):
160
161
  return None
161
162
 
162
163
  def make_deploy_resources_variables(
163
- self,
164
- resources: 'resources_lib.Resources',
165
- cluster_name: resources_utils.ClusterName,
166
- region: 'clouds.Region',
167
- zones: Optional[List['clouds.Zone']],
168
- num_nodes: int,
169
- dryrun: bool = False) -> Dict[str, Optional[Union[str, bool]]]:
164
+ self,
165
+ resources: 'resources_lib.Resources',
166
+ cluster_name: resources_utils.ClusterName,
167
+ region: 'clouds.Region',
168
+ zones: Optional[List['clouds.Zone']],
169
+ num_nodes: int,
170
+ dryrun: bool = False,
171
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
172
+ ) -> Dict[str, Optional[Union[str, bool]]]:
170
173
  del dryrun, cluster_name # unused
171
174
  assert zones is not None, (region, zones)
172
175
 
sky/clouds/scp.py CHANGED
@@ -19,6 +19,7 @@ from sky.utils import status_lib
19
19
  if typing.TYPE_CHECKING:
20
20
  # Renaming to avoid shadowing variables.
21
21
  from sky import resources as resources_lib
22
+ from sky.volumes import volume as volume_lib
22
23
 
23
24
  _CREDENTIAL_FILES = [
24
25
  'scp_credential',
@@ -183,13 +184,15 @@ class SCP(clouds.Cloud):
183
184
  return None
184
185
 
185
186
  def make_deploy_resources_variables(
186
- self,
187
- resources: 'resources_lib.Resources',
188
- cluster_name: 'resources_utils.ClusterName',
189
- region: 'clouds.Region',
190
- zones: Optional[List['clouds.Zone']],
191
- num_nodes: int,
192
- dryrun: bool = False) -> Dict[str, Optional[str]]:
187
+ self,
188
+ resources: 'resources_lib.Resources',
189
+ cluster_name: 'resources_utils.ClusterName',
190
+ region: 'clouds.Region',
191
+ zones: Optional[List['clouds.Zone']],
192
+ num_nodes: int,
193
+ dryrun: bool = False,
194
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
195
+ ) -> Dict[str, Optional[str]]:
193
196
  del cluster_name, dryrun # Unused.
194
197
  assert zones is None, 'SCP does not support zones.'
195
198
 
sky/clouds/vast.py CHANGED
@@ -10,6 +10,7 @@ from sky.utils import resources_utils
10
10
 
11
11
  if typing.TYPE_CHECKING:
12
12
  from sky import resources as resources_lib
13
+ from sky.volumes import volume as volume_lib
13
14
 
14
15
 
15
16
  @registry.CLOUD_REGISTRY.register
@@ -155,13 +156,15 @@ class Vast(clouds.Cloud):
155
156
  return None
156
157
 
157
158
  def make_deploy_resources_variables(
158
- self,
159
- resources: 'resources_lib.Resources',
160
- cluster_name: resources_utils.ClusterName,
161
- region: 'clouds.Region',
162
- zones: Optional[List['clouds.Zone']],
163
- num_nodes: int,
164
- dryrun: bool = False) -> Dict[str, Optional[str]]:
159
+ self,
160
+ resources: 'resources_lib.Resources',
161
+ cluster_name: resources_utils.ClusterName,
162
+ region: 'clouds.Region',
163
+ zones: Optional[List['clouds.Zone']],
164
+ num_nodes: int,
165
+ dryrun: bool = False,
166
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
167
+ ) -> Dict[str, Optional[str]]:
165
168
  del zones, dryrun, cluster_name, num_nodes # unused
166
169
 
167
170
  resources = resources.assert_launchable()
sky/clouds/vsphere.py CHANGED
@@ -18,6 +18,7 @@ if typing.TYPE_CHECKING:
18
18
 
19
19
  # Renaming to avoid shadowing variables.
20
20
  from sky import resources as resources_lib
21
+ from sky.volumes import volume as volume_lib
21
22
  else:
22
23
  requests = adaptors_common.LazyImport('requests')
23
24
 
@@ -184,6 +185,7 @@ class Vsphere(clouds.Cloud):
184
185
  zones: Optional[List['clouds.Zone']],
185
186
  num_nodes: int,
186
187
  dryrun: bool = False,
188
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
187
189
  ) -> Dict[str, Optional[str]]:
188
190
  # TODO get image id here.
189
191
  del cluster_name, dryrun # unused
sky/core.py CHANGED
@@ -33,6 +33,7 @@ from sky.utils import admin_policy_utils
33
33
  from sky.utils import common
34
34
  from sky.utils import common_utils
35
35
  from sky.utils import controller_utils
36
+ from sky.utils import resources_utils
36
37
  from sky.utils import rich_utils
37
38
  from sky.utils import status_lib
38
39
  from sky.utils import subprocess_utils
@@ -75,6 +76,7 @@ def optimize(
75
76
  for a task.
76
77
  exceptions.NoCloudAccessError: if no public clouds are enabled.
77
78
  """
79
+ dag.resolve_and_validate_volumes()
78
80
  # TODO: We apply the admin policy only on the first DAG optimization which
79
81
  # is shown on `sky launch`. The optimizer is also invoked during failover,
80
82
  # but we do not apply the admin policy there. We should apply the admin
@@ -265,7 +267,7 @@ def endpoints(cluster: str,
265
267
  the dictionary will contain all ports:endpoints exposed on the cluster.
266
268
 
267
269
  Raises:
268
- ValueError: if the cluster is not UP or the endpoint is not exposed.
270
+ ValueError: if the cluster is not UP or the endpoint is not exposed.
269
271
  RuntimeError: if the cluster has no ports to be exposed or no endpoints
270
272
  are exposed yet.
271
273
  """
@@ -276,7 +278,7 @@ def endpoints(cluster: str,
276
278
 
277
279
 
278
280
  @usage_lib.entrypoint
279
- def cost_report() -> List[Dict[str, Any]]:
281
+ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
280
282
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
281
283
  """Get all cluster cost reports, including those that have been downed.
282
284
 
@@ -294,6 +296,13 @@ def cost_report() -> List[Dict[str, Any]]:
294
296
  'cluster_hash': (str) unique hash identifying cluster,
295
297
  'usage_intervals': (List[Tuple[int, int]]) cluster usage times,
296
298
  'total_cost': (float) cost given resources and usage intervals,
299
+ 'cloud': (str) cloud of the cluster,
300
+ 'region': (str) region of the cluster,
301
+ 'cpus': (str) number of vCPUs of the cluster,
302
+ 'memory': (str) memory of the cluster,
303
+ 'accelerators': (str) accelerators of the cluster,
304
+ 'resources_str': (str) resources string of the cluster,
305
+ 'resources_str_full': (str) full resources string of the cluster,
297
306
  }
298
307
 
299
308
  The estimated cost column indicates price for the cluster based on the type
@@ -303,27 +312,92 @@ def cost_report() -> List[Dict[str, Any]]:
303
312
  cache of the cluster status, and may not be accurate for the cluster with
304
313
  autostop/use_spot set or terminated/stopped on the cloud console.
305
314
 
315
+ Args:
316
+ days: Number of days to look back from now. Active clusters are always
317
+ included. Historical clusters are only included if they were last
318
+ used within the past 'days' days. Defaults to 30 days.
319
+
306
320
  Returns:
307
321
  A list of dicts, with each dict containing the cost information of a
308
322
  cluster.
309
323
  """
310
- cluster_reports = global_user_state.get_clusters_from_history()
324
+ if days is None:
325
+ days = constants.COST_REPORT_DEFAULT_DAYS
326
+
327
+ cluster_reports = global_user_state.get_clusters_from_history(days=days)
328
+ logger.debug(
329
+ f'{len(cluster_reports)} clusters found from history with {days} days.')
330
+
331
+ def _process_cluster_report(
332
+ cluster_report: Dict[str, Any]) -> Dict[str, Any]:
333
+ """Process cluster report by calculating cost and adding fields."""
334
+ # Make a copy to avoid modifying the original
335
+ report = cluster_report.copy()
336
+
337
+ def get_total_cost(cluster_report: dict) -> float:
338
+ duration = cluster_report['duration']
339
+ launched_nodes = cluster_report['num_nodes']
340
+ launched_resources = cluster_report['resources']
341
+
342
+ cost = (launched_resources.get_cost(duration) * launched_nodes)
343
+ return cost
344
+
345
+ def _update_record_with_resources(record: Dict[str, Any]) -> None:
346
+ """Add resource fields for dashboard compatibility."""
347
+ if record is None:
348
+ return
349
+ resources = record.get('resources')
350
+ if resources is None:
351
+ return
352
+ fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
353
+ for field in fields:
354
+ try:
355
+ record[field] = str(getattr(resources, field))
356
+ except Exception as e: # pylint: disable=broad-except
357
+ # Ok to skip the fields as this is just for display
358
+ # purposes.
359
+ logger.debug(f'Failed to get resources.{field} for cluster '
360
+ f'{record["name"]}: {str(e)}')
361
+ record[field] = None
362
+
363
+ # Add resources_str and resources_str_full for dashboard
364
+ # compatibility
365
+ num_nodes = record.get('num_nodes', 1)
366
+ try:
367
+ resource_str_simple = resources_utils.format_resource(
368
+ resources, simplify=True)
369
+ resource_str_full = resources_utils.format_resource(
370
+ resources, simplify=False)
371
+ record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
372
+ record[
373
+ 'resources_str_full'] = f'{num_nodes}x{resource_str_full}'
374
+ except Exception as e: # pylint: disable=broad-except
375
+ logger.debug(f'Failed to get resources_str for cluster '
376
+ f'{record["name"]}: {str(e)}')
377
+ for field in fields:
378
+ record[field] = None
379
+ record['resources_str'] = '-'
380
+ record['resources_str_full'] = '-'
381
+
382
+ try:
383
+ report['total_cost'] = get_total_cost(report)
384
+ except Exception as e: # pylint: disable=broad-except
385
+ # Ok to skip the total cost as this is just for display purposes.
386
+ logger.warning(f'Failed to get total cost for cluster '
387
+ f'{report["name"]}: {str(e)}')
388
+ report['total_cost'] = 0.0
311
389
 
312
- def get_total_cost(cluster_report: dict) -> float:
313
- duration = cluster_report['duration']
314
- launched_nodes = cluster_report['num_nodes']
315
- launched_resources = cluster_report['resources']
390
+ _update_record_with_resources(report)
391
+ return report
316
392
 
317
- cost = (launched_resources.get_cost(duration) * launched_nodes)
318
- return cost
393
+ # Process clusters in parallel
394
+ if not cluster_reports:
395
+ return []
319
396
 
320
- for cluster_report in cluster_reports:
321
- cluster_report['total_cost'] = get_total_cost(cluster_report)
322
- cluster_report['cloud'] = str(cluster_report['resources'].cloud)
323
- cluster_report['accelerators'] = cluster_report[
324
- 'resources'].accelerators
397
+ processed_reports = subprocess_utils.run_in_parallel(
398
+ _process_cluster_report, cluster_reports)
325
399
 
326
- return cluster_reports
400
+ return processed_reports
327
401
 
328
402
 
329
403
  def _start(
sky/dag.py CHANGED
@@ -83,6 +83,20 @@ class Dag:
83
83
  task.validate(skip_file_mounts=skip_file_mounts,
84
84
  skip_workdir=skip_workdir)
85
85
 
86
+ def resolve_and_validate_volumes(self) -> None:
87
+ for task in self.tasks:
88
+ task.resolve_and_validate_volumes()
89
+
90
+ def pre_mount_volumes(self) -> None:
91
+ vol_map = {}
92
+ # Deduplicate volume mounts.
93
+ for task in self.tasks:
94
+ if task.volume_mounts is not None:
95
+ for volume_mount in task.volume_mounts:
96
+ vol_map[volume_mount.volume_name] = volume_mount
97
+ for volume_mount in vol_map.values():
98
+ volume_mount.pre_mount()
99
+
86
100
 
87
101
  class _DagContext(threading.local):
88
102
  """A thread-local stack of Dags."""
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-0263b00d6a10e64a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"F4kiZ6Zh72jA6HzZ3ncFo","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6133dc1e928bd0b5.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-0ef7418d1a3822f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"ZWdSYkqVe3WjnFR8ocqoG","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
@@ -0,0 +1 @@
1
+ self.__BUILD_MANIFEST=function(s,c,e,a,t,u,n,r,i,j,f,k,b,d){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-6b0d9e5031b70c58.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":["static/chunks/pages/clusters-4aa031d1f42723d8.js"],"/clusters/[cluster]":[s,c,e,a,t,r,j,u,n,f,i,k,b,d,"static/chunks/37-1f1e94f5a561202a.js","static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js"],"/clusters/[cluster]/[job]":[s,c,e,a,t,u,n,"static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js"],"/config":["static/chunks/pages/config-3102d02a188f04b3.js"],"/infra":["static/chunks/pages/infra-fd5dc8a91bd9169a.js"],"/infra/[context]":["static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js"],"/jobs":["static/chunks/pages/jobs-26da173e20af16e4.js"],"/jobs/[job]":[s,c,e,a,t,r,u,n,i,"static/chunks/pages/jobs/[job]-e4b23128db0774cd.js"],"/users":["static/chunks/pages/users-ce29e7420385563d.js"],"/volumes":["static/chunks/pages/volumes-476b670ef33d1ecd.js"],"/workspace/new":["static/chunks/pages/workspace/new-09ae0f6f972aa871.js"],"/workspaces":["static/chunks/pages/workspaces-862b120406461b10.js"],"/workspaces/[name]":[s,c,e,a,t,r,j,u,n,f,i,k,b,d,"static/chunks/843-07d25a7e64462fd8.js","static/chunks/pages/workspaces/[name]-0b4c662a25e4747a.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/config","/infra","/infra/[context]","/jobs","/jobs/[job]","/users","/volumes","/workspace/new","/workspaces","/workspaces/[name]"]}}("static/chunks/616-d6128fa9e7cae6e6.js","static/chunks/230-d6e363362017ff3a.js","static/chunks/799-3625946b2ec2eb30.js","static/chunks/664-047bc03493fda379.js","static/chunks/804-4c9fc53aa74bc191.js","static/chunks/989-db34c16ad7ea6155.js","static/chunks/470-92dd1614396389be.js","static/chunks/798-c0525dc3f21e488d.js","static/chunks/969-d3a0b53f728d280a.js","static/chunks/947-6620842ef80ae879.js","static/chunks/66-66ae330df2d3c1c7.js","st
atic/chunks/856-cdf66268ec878d0c.js","static/chunks/973-5b5019ba333e8d62.js","static/chunks/938-068520cc11738deb.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();