skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (63) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/shadeform.py +89 -0
  3. sky/authentication.py +52 -2
  4. sky/backends/backend_utils.py +35 -25
  5. sky/backends/cloud_vm_ray_backend.py +5 -5
  6. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  7. sky/catalog/kubernetes_catalog.py +19 -25
  8. sky/catalog/shadeform_catalog.py +165 -0
  9. sky/client/cli/command.py +53 -19
  10. sky/client/sdk.py +13 -1
  11. sky/clouds/__init__.py +2 -0
  12. sky/clouds/shadeform.py +393 -0
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/jobs/controller.py +122 -145
  30. sky/jobs/recovery_strategy.py +59 -82
  31. sky/jobs/scheduler.py +5 -5
  32. sky/jobs/state.py +65 -21
  33. sky/jobs/utils.py +58 -22
  34. sky/metrics/utils.py +27 -6
  35. sky/provision/__init__.py +1 -0
  36. sky/provision/kubernetes/utils.py +44 -39
  37. sky/provision/shadeform/__init__.py +11 -0
  38. sky/provision/shadeform/config.py +12 -0
  39. sky/provision/shadeform/instance.py +351 -0
  40. sky/provision/shadeform/shadeform_utils.py +83 -0
  41. sky/server/common.py +4 -2
  42. sky/server/requests/executor.py +25 -3
  43. sky/server/server.py +9 -3
  44. sky/setup_files/dependencies.py +1 -0
  45. sky/sky_logging.py +0 -2
  46. sky/skylet/constants.py +23 -6
  47. sky/skylet/log_lib.py +0 -1
  48. sky/skylet/log_lib.pyi +1 -1
  49. sky/templates/shadeform-ray.yml.j2 +72 -0
  50. sky/utils/common.py +2 -0
  51. sky/utils/context.py +57 -51
  52. sky/utils/context_utils.py +15 -11
  53. sky/utils/controller_utils.py +35 -8
  54. sky/utils/locks.py +20 -5
  55. sky/utils/subprocess_utils.py +4 -3
  56. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
  57. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
  58. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
  59. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
  60. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
  61. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
  62. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
  63. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,351 @@
1
+ """Shadeform instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import requests
6
+
7
+ from sky import sky_logging
8
+ from sky.provision import common
9
+ from sky.provision.shadeform import shadeform_utils
10
+ from sky.utils import status_lib
11
+
12
+ POLL_INTERVAL = 10
13
+ INSTANCE_READY_TIMEOUT = 3600
14
+
15
+ logger = sky_logging.init_logger(__name__)
16
+
17
+ # Status mapping from Shadeform to SkyPilot
18
+ SHADEFORM_STATUS_MAP = {
19
+ 'creating': status_lib.ClusterStatus.INIT,
20
+ 'pending_provider': status_lib.ClusterStatus.INIT,
21
+ 'pending': status_lib.ClusterStatus.INIT,
22
+ 'active': status_lib.ClusterStatus.UP,
23
+ 'deleted': status_lib.ClusterStatus.STOPPED,
24
+ }
25
+
26
+
27
def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
    """Return the Shadeform instances that belong to this cluster.

    Instances are matched by exact name against the head/worker naming
    scheme used at provisioning time ('<cluster>-head' / '<cluster>-worker').

    Returns:
        Mapping from instance id to the raw instance dict from the API;
        an empty dict on any API or parsing failure (best-effort lookup).
    """
    expected_names = (f'{cluster_name_on_cloud}-head',
                      f'{cluster_name_on_cloud}-worker')
    try:
        listing = shadeform_utils.get_instances()
        # NOTE(review): all workers share the same '-worker' name, so
        # matching is by name string, not uniqueness — confirm upstream.
        return {
            inst['id']: inst
            for inst in listing.get('instances', [])
            if inst.get('name') in expected_names
        }
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        logger.warning(f'Failed to get instances: {e}')
        return {}
46
+
47
+
48
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
49
+ """Get the head instance ID from a list of instances."""
50
+ for instance_id, instance in instances.items():
51
+ if instance.get('name', '').endswith('-head'):
52
+ return instance_id
53
+ return None
54
+
55
+
56
def _wait_for_instances_ready(cluster_name_on_cloud: str,
                              expected_count: int,
                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
    """Poll until `expected_count` instances are SSH-reachable.

    An instance counts as ready when its status is 'active' and both its
    'ip' and 'ssh_port' fields are populated.

    Returns:
        True once enough instances are ready; False if `timeout` seconds
        elapse first.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        cluster = _get_cluster_instances(cluster_name_on_cloud)
        ready = sum(1 for inst in cluster.values()
                    if (inst.get('status') == 'active' and
                        inst.get('ip') is not None and
                        inst.get('ssh_port') is not None))
        logger.info(f'Waiting for instances to be ready: '
                    f'({ready}/{expected_count})')
        if ready >= expected_count:
            return True
        time.sleep(POLL_INTERVAL)
    return False
81
+
82
+
83
def _parse_instance_type(instance_type_full: str) -> Tuple[str, str]:
    """Split a catalog InstanceType into (cloud, shade_instance_type).

    The catalog encodes instance types as '{cloud}_{instance_type}', e.g.
    'massedcompute_A6000_basex2'. The instance-type part is normalized to
    Shadeform's naming: hyphens become underscores, a trailing 'B' is
    dropped, and 'GBx' collapses to 'Gx' (case sensitive).
    """
    assert (isinstance(instance_type_full, str) and
            '_' in instance_type_full), \
        f'InstanceType must be in format cloud_instance_type, got: ' \
        f'{instance_type_full}'
    # First underscore separates the provider; the rest is the type name.
    cloud, _, instance_type = instance_type_full.partition('_')
    # Shadeform uses underscores instead of hyphens.
    instance_type = instance_type.replace('-', '_')
    if instance_type.endswith('B'):
        instance_type = instance_type[:-1]
    # Replace "GBx" with "Gx" (case sensitive).
    instance_type = instance_type.replace('GBx', 'Gx')
    assert cloud, 'Cloud provider cannot be empty'
    assert instance_type, 'Instance type cannot be empty'
    return cloud, instance_type


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Ensure `config.count` instances exist for the given cluster.

    Existing 'active' instances are counted first and only the shortfall is
    created. The first instance ever created becomes the head node. On a
    creation failure, all instances created in this call are best-effort
    deleted before the error is re-raised.

    Raises:
        RuntimeError: if the cluster already has enough instances but no
            head node, or if instances do not become ready in time.
    """
    del cluster_name  # unused - we use cluster_name_on_cloud
    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
                f'in region {region}')
    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
    logger.debug(f'DEBUG: config node_config={config.node_config}')

    # Check existing instances.
    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
    head_instance_id = _get_head_instance_id(existing_instances)

    # Only 'active' instances count toward the target.
    active_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') == 'active'
    }
    current_count = len(active_instances)
    target_count = config.count
    logger.info(f'Current instances: {current_count}, target: {target_count}')

    if current_count >= target_count:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node')
        logger.info(f'Cluster already has {current_count} instances, '
                    f'no need to start more')
        return common.ProvisionRecord(
            provider_name='shadeform',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,  # Shadeform doesn't use separate zones
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[])

    # The node_config is identical for every node in this call, so parse
    # the InstanceType once instead of once per created instance.
    node_config = config.node_config
    assert 'InstanceType' in node_config, \
        'InstanceType must be present in node_config'
    cloud, instance_type = _parse_instance_type(node_config['InstanceType'])
    # SSH key ID for authentication - this is optional and may be None.
    ssh_key_id = config.authentication_config.get('ssh_key_id')

    # Create new instances.
    created_instance_ids: List[str] = []
    for _ in range(target_count - current_count):
        node_type = 'head' if head_instance_id is None else 'worker'
        instance_name = f'{cluster_name_on_cloud}-{node_type}'
        create_config = {
            'cloud': cloud,
            'region': region,
            'shade_instance_type': instance_type,
            'name': instance_name,
            'ssh_key_id': ssh_key_id
        }
        try:
            logger.info(f'Creating {node_type} instance: {instance_name}')
            response = shadeform_utils.create_instance(create_config)
            instance_id = response['id']
            created_instance_ids.append(instance_id)
            if head_instance_id is None:
                head_instance_id = instance_id
            logger.info(f'Created instance {instance_id} ({node_type})')
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Failed to create instance: {e}')
            # Best-effort cleanup of instances created in this call.
            for iid in created_instance_ids:
                try:
                    shadeform_utils.delete_instance(iid)
                except requests.exceptions.RequestException as cleanup_e:
                    logger.warning(
                        f'Failed to cleanup instance {iid}: {cleanup_e}')
            raise

    # Wait for all instances to be ready.
    logger.info('Waiting for instances to become ready...')
    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
        raise RuntimeError('Timed out waiting for instances to be ready')

    assert head_instance_id is not None, 'head_instance_id should not be None'
    return common.ProvisionRecord(
        provider_name='shadeform',
        cluster_name=cluster_name_on_cloud,
        region=region,
        # Fix: was zone=region here, but the early-return path above uses
        # zone=None ("Shadeform doesn't use separate zones"); keep the two
        # records consistent.
        zone=None,
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids)
220
+
221
+
222
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """No-op for Shadeform.

    Instances are ready once they reach 'active' status, and run_instances
    already waits for that, so there is nothing left to poll here.
    """
    del region, cluster_name_on_cloud, state  # unused
228
+
229
+
230
def stop_instances(cluster_name_on_cloud: str,
                   provider_config: Optional[Dict[str, Any]] = None,
                   worker_only: bool = False) -> None:
    """Stop instances — always raises: Shadeform has no stop API.

    Raises:
        NotImplementedError: unconditionally.
    """
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    raise NotImplementedError(
        'Stopping instances is not supported by Shadeform')
237
+
238
+
239
def terminate_instances(cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        worker_only: bool = False) -> None:
    """Delete the cluster's instances (workers only when `worker_only`).

    Deletion failures are logged and skipped so a single bad instance does
    not block tearing down the rest of the cluster.
    """
    del provider_config  # unused
    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')

    instances = _get_cluster_instances(cluster_name_on_cloud)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    for instance_id, instance in instances.items():
        # Spare the head node when only workers should go away.
        is_head = instance.get('name', '').endswith('-head')
        if worker_only and is_head:
            continue
        try:
            logger.info(
                f'Terminating instance {instance_id} ({instance.get("name")})')
            shadeform_utils.delete_instance(instance_id)
        except requests.exceptions.RequestException as e:
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
268
+
269
+
270
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo snapshot from the live Shadeform instances."""
    del region, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)
    if not instances:
        return common.ClusterInfo(instances={},
                                  head_instance_id=None,
                                  provider_name='shadeform')

    head_instance_id = _get_head_instance_id(instances)

    # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]]. Shadeform
    # exposes a single address, so it serves as both internal and external IP.
    cluster_instances = {
        instance_id: [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=instance.get('ip', ''),
                external_ip=instance.get('ip', ''),
                ssh_port=instance.get('ssh_port', 22),
                tags={},
            )
        ] for instance_id, instance in instances.items()
    }

    # Prefer the head node's reported SSH user; fall back to the default.
    ssh_user = 'shadeform'
    if head_instance_id is not None:
        ssh_user = instances.get(head_instance_id,
                                 {}).get('ssh_user', 'shadeform')

    return common.ClusterInfo(instances=cluster_instances,
                              head_instance_id=head_instance_id,
                              provider_name='shadeform',
                              ssh_user=ssh_user)
307
+
308
+
309
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Map each cluster instance id to its SkyPilot status.

    Unknown Shadeform statuses default to INIT. When `non_terminated_only`
    is set, instances that map to STOPPED (Shadeform 'deleted') are omitted
    from the result.
    """
    del cluster_name, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)
    if not instances:
        return {}

    status_map: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                Optional[str]]] = {}
    for instance_id, instance in instances.items():
        raw_status = instance.get('status', 'unknown')
        sky_status = SHADEFORM_STATUS_MAP.get(raw_status,
                                              status_lib.ClusterStatus.INIT)
        skip = (non_terminated_only and
                sky_status == status_lib.ClusterStatus.STOPPED)
        if not skip:
            status_map[instance_id] = (sky_status, None)
    return status_map
336
+
337
+
338
def open_ports(cluster_name_on_cloud: str,
               ports: List[str],
               provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Open ports — always raises: Shadeform exposes no port/firewall API.

    Raises:
        NotImplementedError: unconditionally.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    raise NotImplementedError()
344
+
345
+
346
def cleanup_ports(cluster_name_on_cloud: str,
                  ports: List[str],
                  provider_config: Optional[Dict[str, Any]] = None) -> None:
    """No-op: there is nothing to clean up because dynamic port opening is
    not supported for Shadeform (see open_ports)."""
    del cluster_name_on_cloud, ports, provider_config  # unused
@@ -0,0 +1,83 @@
1
+ """Shadeform API utilities."""
2
+
3
+ import os
4
+ from typing import Any, Dict
5
+
6
+ from sky.adaptors import common
7
+
8
+ # Lazy import to avoid dependency on external packages
9
+ requests = common.LazyImport('requests')
10
+
11
+ # Shadeform API configuration
12
+ SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
13
+ SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
14
+
15
+
16
def get_api_key() -> str:
    """Read the Shadeform API key from the on-disk key file.

    Returns:
        The key string with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: if the key file does not exist.
        ValueError: if the file exists but is empty (or whitespace only).
    """
    api_key_path = os.path.expanduser(SHADEFORM_API_KEY_PATH)
    if not os.path.exists(api_key_path):
        raise FileNotFoundError(
            f'Shadeform API key not found at {api_key_path}. '
            'Please save your API key to this file.')

    with open(api_key_path, 'r', encoding='utf-8') as f:
        api_key = f.read().strip()

    if api_key:
        return api_key
    raise ValueError(f'Shadeform API key is empty in {api_key_path}')
31
+
32
+
33
def make_request(method: str, endpoint: str, **kwargs) -> Any:
    """Make an authenticated request to the Shadeform API.

    Args:
        method: HTTP method, e.g. 'GET' or 'POST'.
        endpoint: API path relative to SHADEFORM_API_BASE; a leading '/'
            is tolerated.
        **kwargs: passed through to requests.request (e.g. json=, params=,
            timeout=).

    Returns:
        The decoded JSON body, or {} for empty responses (e.g. delete).

    Raises:
        requests.exceptions.HTTPError: on non-2xx responses.
    """
    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
    headers = {
        'X-API-KEY': get_api_key(),
        'Content-Type': 'application/json',
    }

    # Fix: always bound the request. Without a timeout, a stuck connection
    # would hang the caller indefinitely. Callers may still override via
    # kwargs['timeout'].
    kwargs.setdefault('timeout', 30)
    response = requests.request(method, url, headers=headers, **kwargs)
    response.raise_for_status()

    # Some APIs (like delete) return empty responses with just 200 status
    if response.text.strip():
        return response.json()
    else:
        # Return empty dict for empty responses (e.g. delete operations)
        return {}
50
+
51
+
52
def get_instances() -> Dict[str, Any]:
    """List all instances for this account (GET /instances)."""
    return make_request('GET', '/instances')


def get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Fetch detail for one instance (GET /instances/{id}/info)."""
    return make_request('GET', f'/instances/{instance_id}/info')


def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
    """Launch a new instance (POST /instances/create, `config` as JSON body)."""
    return make_request('POST', '/instances/create', json=config)


def delete_instance(instance_id: str) -> Dict[str, Any]:
    """Delete an instance (POST /instances/{id}/delete).

    Note: Shadeform delete API returns empty response with 200 status,
    so this returns {} on success (see make_request).
    """
    return make_request('POST', f'/instances/{instance_id}/delete')
73
+
74
+
75
def get_ssh_keys() -> Dict[str, Any]:
    """List all registered SSH keys (GET /sshkeys)."""
    return make_request('GET', '/sshkeys')


def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
    """Register a new SSH public key under `name` (POST /sshkeys/add)."""
    config = {'name': name, 'public_key': public_key}
    return make_request('POST', '/sshkeys/add', json=config)
sky/server/common.py CHANGED
@@ -554,8 +554,8 @@ def _start_api_server(deploy: bool = False,
554
554
  # pylint: disable=import-outside-toplevel
555
555
  import sky.jobs.utils as job_utils
556
556
  max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
557
- if job_utils.is_consolidation_mode() else
558
- server_constants.MIN_AVAIL_MEM_GB)
557
+ if job_utils.is_consolidation_mode(on_api_restart=True)
558
+ else server_constants.MIN_AVAIL_MEM_GB)
559
559
  if avail_mem_size_gb <= max_memory:
560
560
  logger.warning(
561
561
  f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
@@ -571,6 +571,8 @@ def _start_api_server(deploy: bool = False,
571
571
  args += [f'--host={host}']
572
572
  if metrics_port is not None:
573
573
  args += [f'--metrics-port={metrics_port}']
574
+ # Use this argument to disable the internal signal file check.
575
+ args += ['--start-with-python']
574
576
 
575
577
  if foreground:
576
578
  # Replaces the current process with the API server
@@ -81,6 +81,26 @@ logger = sky_logging.init_logger(__name__)
81
81
  # platforms, including macOS.
82
82
  multiprocessing.set_start_method('spawn', force=True)
83
83
 
84
+ # Max threads that is equivalent to the number of thread workers in the
85
+ # default thread pool executor of event loop.
86
+ _REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 0) + 4)
87
+
88
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
89
+ # A dedicated thread pool executor for synced requests execution in coroutine
90
+ _REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
91
+
92
+
93
+ def get_request_thread_executor() -> concurrent.futures.ThreadPoolExecutor:
94
+ """Lazy init and return the request thread executor for current process."""
95
+ global _REQUEST_THREAD_EXECUTOR
96
+ if _REQUEST_THREAD_EXECUTOR is not None:
97
+ return _REQUEST_THREAD_EXECUTOR
98
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
99
+ if _REQUEST_THREAD_EXECUTOR is None:
100
+ _REQUEST_THREAD_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
101
+ max_workers=_REQUEST_THREADS_LIMIT)
102
+ return _REQUEST_THREAD_EXECUTOR
103
+
84
104
 
85
105
  class RequestQueue:
86
106
  """The queue for the requests, either redis or multiprocessing.
@@ -404,6 +424,7 @@ def _request_execution_wrapper(request_id: str,
404
424
  os.close(original_stderr)
405
425
  original_stderr = None
406
426
 
427
+ request_name = None
407
428
  try:
408
429
  # As soon as the request is updated with the executor PID, we can
409
430
  # receive SIGTERM from cancellation. So, we update the request inside
@@ -495,7 +516,8 @@ def _request_execution_wrapper(request_id: str,
495
516
  annotations.clear_request_level_cache()
496
517
  with metrics_utils.time_it(name='release_memory', group='internal'):
497
518
  common_utils.release_memory()
498
- _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
519
+ if request_name is not None:
520
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
499
521
  except Exception as e: # pylint: disable=broad-except
500
522
  logger.error(f'Failed to record memory metrics: '
501
523
  f'{common_utils.format_exception(e)}')
@@ -576,8 +598,8 @@ async def _execute_request_coroutine(request: api_requests.Request):
576
598
  # 1. skypilot config is not contextual
577
599
  # 2. envs that read directly from os.environ are not contextual
578
600
  ctx.override_envs(request_body.env_vars)
579
- fut: asyncio.Future = context_utils.to_thread(func,
580
- **request_body.to_kwargs())
601
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
602
+ get_request_thread_executor(), func, **request_body.to_kwargs())
581
603
 
582
604
  async def poll_task(request_id: str) -> bool:
583
605
  req_status = await api_requests.get_request_status_async(request_id)
sky/server/server.py CHANGED
@@ -3,6 +3,7 @@
3
3
  import argparse
4
4
  import asyncio
5
5
  import base64
6
+ from concurrent.futures import ThreadPoolExecutor
6
7
  import contextlib
7
8
  import datetime
8
9
  import hashlib
@@ -1731,9 +1732,9 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1731
1732
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
1732
1733
 
1733
1734
  # Run core.status in another thread to avoid blocking the event loop.
1734
- cluster_records = await context_utils.to_thread(core.status,
1735
- cluster_name,
1736
- all_users=True)
1735
+ with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
1736
+ cluster_records = await context_utils.to_thread_with_executor(
1737
+ thread_pool_executor, core.status, cluster_name, all_users=True)
1737
1738
  cluster_record = cluster_records[0]
1738
1739
  if cluster_record['status'] != status_lib.ClusterStatus.UP:
1739
1740
  raise fastapi.HTTPException(
@@ -1967,6 +1968,7 @@ if __name__ == '__main__':
1967
1968
  # Serve metrics on a separate port to isolate it from the application APIs:
1968
1969
  # metrics port will not be exposed to the public network typically.
1969
1970
  parser.add_argument('--metrics-port', default=9090, type=int)
1971
+ parser.add_argument('--start-with-python', action='store_true')
1970
1972
  cmd_args = parser.parse_args()
1971
1973
  if cmd_args.port == cmd_args.metrics_port:
1972
1974
  logger.error('port and metrics-port cannot be the same, exiting.')
@@ -1981,6 +1983,10 @@ if __name__ == '__main__':
1981
1983
  logger.error(f'Port {cmd_args.port} is not available, exiting.')
1982
1984
  raise RuntimeError(f'Port {cmd_args.port} is not available')
1983
1985
 
1986
+ if not cmd_args.start_with_python:
1987
+ # Maybe touch the signal file on API server startup.
1988
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)
1989
+
1984
1990
  # Show the privacy policy if it is not already shown. We place it here so
1985
1991
  # that it is shown only when the API server is started.
1986
1992
  usage_lib.maybe_show_privacy_policy()
@@ -222,6 +222,7 @@ extras_require: Dict[str, List[str]] = {
222
222
  'hyperbolic': [], # No dependencies needed for hyperbolic
223
223
  'seeweb': ['ecsapi>=0.2.0'],
224
224
  'server': server_dependencies,
225
+ 'shadeform': [], # No dependencies needed for shadeform
225
226
  }
226
227
 
227
228
  # Calculate which clouds should be included in the [all] installation.
sky/sky_logging.py CHANGED
@@ -109,7 +109,6 @@ def _setup_logger():
109
109
  global _default_handler
110
110
  if _default_handler is None:
111
111
  _default_handler = EnvAwareHandler(sys.stdout)
112
- _default_handler.flush = sys.stdout.flush # type: ignore
113
112
  if env_options.Options.SHOW_DEBUG_INFO.get():
114
113
  _default_handler.setLevel(logging.DEBUG)
115
114
  else:
@@ -129,7 +128,6 @@ def _setup_logger():
129
128
  for logger_name in _SENSITIVE_LOGGER:
130
129
  logger = logging.getLogger(logger_name)
131
130
  handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
132
- handler_to_logger.flush = sys.stdout.flush # type: ignore
133
131
  logger.addHandler(handler_to_logger)
134
132
  logger.setLevel(logging.INFO)
135
133
  if _show_logging_prefix():
sky/skylet/constants.py CHANGED
@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
226
226
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
227
227
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
228
228
  f'|| {RAY_STATUS} || '
229
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
229
+ # The pydantic-core==2.41.3 for arm seems corrupted
230
+ # so we need to avoid that specific version.
231
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
230
232
  # In some envs, e.g. pip does not have permission to write under /opt/conda
231
233
  # ray package will be installed under ~/.local/bin. If the user's PATH does
232
234
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -402,10 +404,25 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
402
404
  ]
403
405
  # When overriding the SkyPilot configs on the API server with the client one,
404
406
  # we skip the following keys because they are meant to be client-side configs.
405
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
406
- ('allowed_clouds',),
407
- ('workspaces',), ('db',),
408
- ('daemons',)]
407
+ # Also, we skip the consolidation mode config as those should be only set on
408
+ # the API server side.
409
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
410
+ ('api_server',),
411
+ ('allowed_clouds',),
412
+ ('workspaces',),
413
+ ('db',),
414
+ ('daemons',),
415
+ # TODO(kevin,tian): Override the whole controller config once our test
416
+ # infrastructure supports setting dynamic server side configs.
417
+ # Tests that are affected:
418
+ # - test_managed_jobs_ha_kill_starting
419
+ # - test_managed_jobs_ha_kill_running
420
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
421
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
422
+ # but the configs won't be applied)
423
+ ('jobs', 'controller', 'consolidation_mode'),
424
+ ('serve', 'controller', 'consolidation_mode'),
425
+ ]
409
426
 
410
427
  # Constants for Azure blob storage
411
428
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -471,7 +488,7 @@ CATALOG_DIR = '~/.sky/catalogs'
471
488
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
472
489
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
473
490
  'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
474
- 'hyperbolic', 'seeweb')
491
+ 'hyperbolic', 'seeweb', 'shadeform')
475
492
  # END constants used for service catalog.
476
493
 
477
494
  # The user ID of the SkyPilot system.
sky/skylet/log_lib.py CHANGED
@@ -271,7 +271,6 @@ def run_with_log(
271
271
  stdout, stderr = context_utils.pipe_and_wait_process(
272
272
  ctx,
273
273
  proc,
274
- cancel_callback=subprocess_utils.kill_children_processes,
275
274
  stdout_stream_handler=stdout_stream_handler,
276
275
  stderr_stream_handler=stderr_stream_handler)
277
276
  elif process_stream:
sky/skylet/log_lib.pyi CHANGED
@@ -42,7 +42,7 @@ class _ProcessingArgs:
42
42
  ...
43
43
 
44
44
 
45
- def _get_context() -> Optional[context.Context]:
45
+ def _get_context() -> Optional[context.SkyPilotContext]:
46
46
  ...
47
47
 
48
48