skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (63)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/shadeform.py +89 -0
  3. sky/authentication.py +52 -2
  4. sky/backends/backend_utils.py +35 -25
  5. sky/backends/cloud_vm_ray_backend.py +5 -5
  6. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  7. sky/catalog/kubernetes_catalog.py +19 -25
  8. sky/catalog/shadeform_catalog.py +165 -0
  9. sky/client/cli/command.py +53 -19
  10. sky/client/sdk.py +13 -1
  11. sky/clouds/__init__.py +2 -0
  12. sky/clouds/shadeform.py +393 -0
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/jobs/controller.py +122 -145
  30. sky/jobs/recovery_strategy.py +59 -82
  31. sky/jobs/scheduler.py +5 -5
  32. sky/jobs/state.py +65 -21
  33. sky/jobs/utils.py +58 -22
  34. sky/metrics/utils.py +27 -6
  35. sky/provision/__init__.py +1 -0
  36. sky/provision/kubernetes/utils.py +44 -39
  37. sky/provision/shadeform/__init__.py +11 -0
  38. sky/provision/shadeform/config.py +12 -0
  39. sky/provision/shadeform/instance.py +351 -0
  40. sky/provision/shadeform/shadeform_utils.py +83 -0
  41. sky/server/common.py +4 -2
  42. sky/server/requests/executor.py +25 -3
  43. sky/server/server.py +9 -3
  44. sky/setup_files/dependencies.py +1 -0
  45. sky/sky_logging.py +0 -2
  46. sky/skylet/constants.py +23 -6
  47. sky/skylet/log_lib.py +0 -1
  48. sky/skylet/log_lib.pyi +1 -1
  49. sky/templates/shadeform-ray.yml.j2 +72 -0
  50. sky/utils/common.py +2 -0
  51. sky/utils/context.py +57 -51
  52. sky/utils/context_utils.py +15 -11
  53. sky/utils/controller_utils.py +35 -8
  54. sky/utils/locks.py +20 -5
  55. sky/utils/subprocess_utils.py +4 -3
  56. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
  57. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
  58. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
  59. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
  60. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
  61. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
  62. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
  63. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,165 @@
1
+ """ Shadeform | Catalog
2
+
3
+ This module loads pricing and instance information from the Shadeform API
4
+ and can be used to query instance types and pricing information for Shadeform.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ import pandas as pd
11
+
12
+ from sky.catalog import common
13
+
14
+ if typing.TYPE_CHECKING:
15
+ from sky.clouds import cloud
16
+
17
+ # We'll use dynamic fetching, so no static CSV file to load
18
+ _df = None
19
+
20
+
21
+ def _get_df():
22
+ """Get the dataframe, fetching from API if needed."""
23
+ global _df
24
+ if _df is None:
25
+ # For now, we'll fall back to a minimal static catalog
26
+ # In a full implementation, this would call the Shadeform API
27
+ # to dynamically fetch the latest instance types and pricing
28
+ try:
29
+ df = common.read_catalog('shadeform/vms.csv')
30
+ except FileNotFoundError:
31
+ # If no static catalog exists, create an empty one
32
+ # This would be replaced with dynamic API fetching
33
+ _df = pd.DataFrame(columns=[
34
+ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
35
+ 'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
36
+ ])
37
+ else:
38
+ df = df[df['InstanceType'].notna()]
39
+ if 'AcceleratorName' in df.columns:
40
+ df = df[df['AcceleratorName'].notna()]
41
+ df = df.assign(AcceleratorName=df['AcceleratorName'].astype(
42
+ str).str.strip())
43
+ _df = df.reset_index(drop=True)
44
+ return _df
45
+
46
+
47
+ def _is_not_found_error(err: ValueError) -> bool:
48
+ msg = str(err).lower()
49
+ return 'not found' in msg or 'not supported' in msg
50
+
51
+
52
+ def _call_or_default(func, default):
53
+ try:
54
+ return func()
55
+ except ValueError as err:
56
+ if _is_not_found_error(err):
57
+ return default
58
+ raise
59
+
60
+
61
+ def instance_type_exists(instance_type: str) -> bool:
62
+ """Check if an instance type exists."""
63
+ return common.instance_type_exists_impl(_get_df(), instance_type)
64
+
65
+
66
+ def validate_region_zone(
67
+ region: Optional[str],
68
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
69
+ """Validate region and zone for Shadeform."""
70
+ return common.validate_region_zone_impl('shadeform', _get_df(), region,
71
+ zone)
72
+
73
+
74
+ def get_hourly_cost(instance_type: str,
75
+ use_spot: bool = False,
76
+ region: Optional[str] = None,
77
+ zone: Optional[str] = None) -> float:
78
+ """Returns the cost, or the cheapest cost among all zones for spot."""
79
+ # Shadeform doesn't support spot instances currently
80
+ if use_spot:
81
+ raise ValueError('Spot instances are not supported on Shadeform')
82
+
83
+ return common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
84
+ region, zone)
85
+
86
+
87
+ def get_vcpus_mem_from_instance_type(
88
+ instance_type: str) -> Tuple[Optional[float], Optional[float]]:
89
+ """Get vCPUs and memory from instance type."""
90
+ return _call_or_default(
91
+ lambda: common.get_vcpus_mem_from_instance_type_impl(
92
+ _get_df(), instance_type), (None, None))
93
+
94
+
95
+ def get_default_instance_type(cpus: Optional[str] = None,
96
+ memory: Optional[str] = None,
97
+ disk_tier: Optional[str] = None,
98
+ region: Optional[str] = None,
99
+ zone: Optional[str] = None) -> Optional[str]:
100
+ """Get default instance type based on requirements."""
101
+ del disk_tier # Shadeform doesn't support custom disk tiers yet
102
+ return _call_or_default(
103
+ lambda: common.get_instance_type_for_cpus_mem_impl(
104
+ _get_df(), cpus, memory, region, zone), None)
105
+
106
+
107
+ def get_accelerators_from_instance_type(
108
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
109
+ """Get accelerator information from instance type."""
110
+ return _call_or_default(
111
+ lambda: common.get_accelerators_from_instance_type_impl(
112
+ _get_df(), instance_type), None)
113
+
114
+
115
+ def get_instance_type_for_accelerator(
116
+ acc_name: str,
117
+ acc_count: int,
118
+ cpus: Optional[str] = None,
119
+ memory: Optional[str] = None,
120
+ use_spot: bool = False,
121
+ region: Optional[str] = None,
122
+ zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
123
+ """Returns a list of instance types that have the given accelerator."""
124
+ if use_spot:
125
+ # Return empty lists since spot is not supported
126
+ return None, ['Spot instances are not supported on Shadeform']
127
+
128
+ return _call_or_default(
129
+ lambda: common.get_instance_type_for_accelerator_impl(
130
+ df=_get_df(),
131
+ acc_name=acc_name,
132
+ acc_count=acc_count,
133
+ cpus=cpus,
134
+ memory=memory,
135
+ use_spot=use_spot,
136
+ region=region,
137
+ zone=zone), (None, []))
138
+
139
+
140
+ def get_region_zones_for_instance_type(instance_type: str,
141
+ use_spot: bool) -> List['cloud.Region']:
142
+ """Get regions and zones for an instance type."""
143
+ if use_spot:
144
+ return [] # No spot support
145
+
146
+ df = _get_df()
147
+ df_filtered = df[df['InstanceType'] == instance_type]
148
+ return _call_or_default(
149
+ lambda: common.get_region_zones(df_filtered, use_spot), [])
150
+
151
+
152
+ def list_accelerators(
153
+ gpus_only: bool,
154
+ name_filter: Optional[str],
155
+ region_filter: Optional[str],
156
+ quantity_filter: Optional[int],
157
+ case_sensitive: bool = True,
158
+ all_regions: bool = False,
159
+ require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
160
+ """Returns all instance types in Shadeform offering GPUs."""
161
+ del require_price # Unused.
162
+ return common.list_accelerators_impl('Shadeform', _get_df(), gpus_only,
163
+ name_filter, region_filter,
164
+ quantity_filter, case_sensitive,
165
+ all_regions)
sky/client/cli/command.py CHANGED
@@ -158,12 +158,17 @@ def _get_cluster_records_and_set_ssh_config(
158
158
  # Update the SSH config for all clusters
159
159
  for record in cluster_records:
160
160
  handle = record['handle']
161
-
161
+ name = record['name']
162
162
  if not (handle is not None and handle.cached_external_ips is not None
163
163
  and 'credentials' in record):
164
164
  # If the cluster is not UP or does not have credentials available,
165
165
  # we need to remove the cluster from the SSH config.
166
- cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
166
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
167
+ continue
168
+ if not record['credentials']:
169
+ # The credential is missing for some reason, continue.
170
+ logger.debug(
171
+ f'Client did not receive SSH credential for cluster {name}')
167
172
  continue
168
173
 
169
174
  # During the failover, even though a cluster does not exist, the handle
@@ -1868,7 +1873,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1868
1873
  controllers = []
1869
1874
  for cluster_record in cluster_records:
1870
1875
  cluster_name = cluster_record['name']
1871
- controller = controller_utils.Controllers.from_name(cluster_name)
1876
+ controller = controller_utils.Controllers.from_name(
1877
+ cluster_name, expect_exact_match=False)
1872
1878
  if controller is not None:
1873
1879
  controllers.append(cluster_record)
1874
1880
  else:
@@ -2034,7 +2040,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
2034
2040
  for cluster_record in cluster_records:
2035
2041
  cluster_name = cluster_record['name']
2036
2042
  try:
2037
- controller = controller_utils.Controllers.from_name(cluster_name)
2043
+ controller = controller_utils.Controllers.from_name(
2044
+ cluster_name, expect_exact_match=False)
2038
2045
  except AssertionError:
2039
2046
  # There could be some old controller clusters from previous
2040
2047
  # versions that we should not show in the cost report.
@@ -2406,7 +2413,8 @@ def cancel(
2406
2413
  job_ids=job_ids_to_cancel)
2407
2414
  _async_call_or_wait(request_id, async_call, 'sky.cancel')
2408
2415
  except exceptions.NotSupportedError as e:
2409
- controller = controller_utils.Controllers.from_name(cluster)
2416
+ controller = controller_utils.Controllers.from_name(
2417
+ cluster, expect_exact_match=False)
2410
2418
  assert controller is not None, cluster
2411
2419
  with ux_utils.print_exception_no_traceback():
2412
2420
  raise click.UsageError(
@@ -2707,7 +2715,8 @@ def start(
2707
2715
  # Get all clusters that are not controllers.
2708
2716
  cluster_records = [
2709
2717
  cluster for cluster in all_clusters
2710
- if controller_utils.Controllers.from_name(cluster['name']) is None
2718
+ if controller_utils.Controllers.from_name(
2719
+ cluster['name'], expect_exact_match=False) is None
2711
2720
  ]
2712
2721
  if cluster_records is None:
2713
2722
  # Get GLOB cluster names
@@ -2769,7 +2778,8 @@ def start(
2769
2778
  # Checks for controller clusters (jobs controller / sky serve controller).
2770
2779
  controllers, normal_clusters = [], []
2771
2780
  for name in to_start:
2772
- if controller_utils.Controllers.from_name(name) is not None:
2781
+ if controller_utils.Controllers.from_name(
2782
+ name, expect_exact_match=False) is not None:
2773
2783
  controllers.append(name)
2774
2784
  else:
2775
2785
  normal_clusters.append(name)
@@ -2905,7 +2915,8 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2905
2915
  to be torn down (e.g., because it has jobs running or
2906
2916
  it is in init state)
2907
2917
  """
2908
- controller = controller_utils.Controllers.from_name(controller_name)
2918
+ controller = controller_utils.Controllers.from_name(
2919
+ controller_name, expect_exact_match=False)
2909
2920
  assert controller is not None, controller_name
2910
2921
 
2911
2922
  with rich_utils.client_status(
@@ -3004,7 +3015,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
3004
3015
  to be torn down (e.g., because it has services running or
3005
3016
  it is in init state)
3006
3017
  """
3007
- controller = controller_utils.Controllers.from_name(controller_name)
3018
+ controller = controller_utils.Controllers.from_name(
3019
+ controller_name, expect_exact_match=False)
3008
3020
  assert controller is not None, controller_name
3009
3021
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
3010
3022
  try:
@@ -3115,14 +3127,15 @@ def _down_or_stop_clusters(
3115
3127
  names = list(names)
3116
3128
  if names:
3117
3129
  controllers = [
3118
- name for name in names
3119
- if controller_utils.Controllers.from_name(name) is not None
3130
+ name for name in names if controller_utils.Controllers.from_name(
3131
+ name, expect_exact_match=False) is not None
3120
3132
  ]
3121
3133
  controllers_str = ', '.join(map(repr, controllers))
3122
3134
  names = [
3123
3135
  cluster['name']
3124
3136
  for cluster in _get_cluster_records_and_set_ssh_config(names)
3125
- if controller_utils.Controllers.from_name(cluster['name']) is None
3137
+ if controller_utils.Controllers.from_name(
3138
+ cluster['name'], expect_exact_match=False) is None
3126
3139
  ]
3127
3140
 
3128
3141
  # Make sure the controllers are explicitly specified without other
@@ -3147,7 +3160,7 @@ def _down_or_stop_clusters(
3147
3160
  f'{controllers_str} is currently not supported.')
3148
3161
  else:
3149
3162
  controller = controller_utils.Controllers.from_name(
3150
- controller_name)
3163
+ controller_name, expect_exact_match=False)
3151
3164
  assert controller is not None
3152
3165
  hint_or_raise = _controller_to_hint_or_raise(controller)
3153
3166
  try:
@@ -3195,9 +3208,10 @@ def _down_or_stop_clusters(
3195
3208
  names = [
3196
3209
  record['name']
3197
3210
  for record in all_clusters
3198
- if controller_utils.Controllers.from_name(record['name']) is None
3199
- and (down or idle_minutes_to_autostop is not None or
3200
- record['status'] != status_lib.ClusterStatus.STOPPED)
3211
+ if controller_utils.Controllers.from_name(
3212
+ record['name'], expect_exact_match=False) is None and
3213
+ (down or idle_minutes_to_autostop is not None or
3214
+ record['status'] != status_lib.ClusterStatus.STOPPED)
3201
3215
  ]
3202
3216
 
3203
3217
  clusters = names
@@ -3227,6 +3241,9 @@ def _down_or_stop_clusters(
3227
3241
 
3228
3242
  request_ids = []
3229
3243
 
3244
+ successes: List[str] = []
3245
+ failures: List[Tuple[str, str]] = []
3246
+
3230
3247
  def _down_or_stop(name: str):
3231
3248
  success_progress = False
3232
3249
  if idle_minutes_to_autostop is not None:
@@ -3237,9 +3254,10 @@ def _down_or_stop_clusters(
3237
3254
  _async_call_or_wait(
3238
3255
  request_id, async_call,
3239
3256
  server_constants.REQUEST_NAME_PREFIX + operation)
3240
- except (exceptions.NotSupportedError,
3241
- exceptions.ClusterNotUpError) as e:
3257
+ except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
3258
+ exceptions.CloudError) as e:
3242
3259
  message = str(e)
3260
+ failures.append((name, str(e)))
3243
3261
  else: # no exception raised
3244
3262
  success_progress = True
3245
3263
  message = (f'{colorama.Fore.GREEN}{operation} '
@@ -3275,13 +3293,17 @@ def _down_or_stop_clusters(
3275
3293
  f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
3276
3294
  f'{colorama.Style.RESET_ALL}'
3277
3295
  f'\nReason: {common_utils.format_exception(e)}.')
3296
+ failures.append((name, str(e)))
3278
3297
  except (exceptions.NotSupportedError,
3279
- exceptions.ClusterOwnerIdentityMismatchError) as e:
3298
+ exceptions.ClusterOwnerIdentityMismatchError,
3299
+ exceptions.CloudError) as e:
3280
3300
  message = str(e)
3301
+ failures.append((name, str(e)))
3281
3302
  else: # no exception raised
3282
3303
  message = (
3283
3304
  f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
3284
3305
  f'{colorama.Style.RESET_ALL}')
3306
+ successes.append(name)
3285
3307
  if not down:
3286
3308
  message += ('\n To restart the cluster, run: '
3287
3309
  f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3304,6 +3326,18 @@ def _down_or_stop_clusters(
3304
3326
  click.secho(f'{operation} requests are sent. Check the requests\' '
3305
3327
  'status with `sky request get <request_id>`.')
3306
3328
 
3329
+ click.echo('\nSummary:')
3330
+ if successes:
3331
+ click.echo(' ✓ Succeeded: ' + ', '.join(successes))
3332
+ if failures:
3333
+ failed_pretty = []
3334
+ for name, reason in failures:
3335
+ first = reason.strip().splitlines()[0]
3336
+ first = first if len(first) <= 120 else first[:120] + '…'
3337
+ failed_pretty.append(f'{name} ({first})')
3338
+ click.echo(' ✗ Failed: ' + ', '.join(failed_pretty))
3339
+ raise click.ClickException('Some clusters failed. See summary above.')
3340
+
3307
3341
 
3308
3342
  @cli.command(cls=_DocumentedCodeCommand)
3309
3343
  @flags.config_option(expose_value=False)
sky/client/sdk.py CHANGED
@@ -98,6 +98,9 @@ def reload_config() -> None:
98
98
  skypilot_config.safe_reload_config()
99
99
 
100
100
 
101
+ # The overloads are not comprehensive - e.g. get_result Literal[False] could be
102
+ # specified to return None. We can add more overloads if needed. To do that see
103
+ # https://github.com/python/mypy/issues/8634#issuecomment-609411104
101
104
  @typing.overload
102
105
  def stream_response(request_id: None,
103
106
  response: 'requests.Response',
@@ -112,7 +115,16 @@ def stream_response(request_id: server_common.RequestId[T],
112
115
  response: 'requests.Response',
113
116
  output_stream: Optional['io.TextIOBase'] = None,
114
117
  resumable: bool = False,
115
- get_result: bool = True) -> T:
118
+ get_result: Literal[True] = True) -> T:
119
+ ...
120
+
121
+
122
+ @typing.overload
123
+ def stream_response(request_id: server_common.RequestId[T],
124
+ response: 'requests.Response',
125
+ output_stream: Optional['io.TextIOBase'] = None,
126
+ resumable: bool = False,
127
+ get_result: bool = True) -> Optional[T]:
116
128
  ...
117
129
 
118
130
 
sky/clouds/__init__.py CHANGED
@@ -30,6 +30,7 @@ from sky.clouds.primeintellect import PrimeIntellect
30
30
  from sky.clouds.runpod import RunPod
31
31
  from sky.clouds.scp import SCP
32
32
  from sky.clouds.seeweb import Seeweb
33
+ from sky.clouds.shadeform import Shadeform
33
34
  from sky.clouds.ssh import SSH
34
35
  from sky.clouds.vast import Vast
35
36
  from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
48
49
  'PrimeIntellect',
49
50
  'SCP',
50
51
  'RunPod',
52
+ 'Shadeform',
51
53
  'Vast',
52
54
  'OCI',
53
55
  'Vsphere',