skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (63) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/shadeform.py +89 -0
  3. sky/authentication.py +52 -2
  4. sky/backends/backend_utils.py +35 -25
  5. sky/backends/cloud_vm_ray_backend.py +5 -5
  6. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  7. sky/catalog/kubernetes_catalog.py +19 -25
  8. sky/catalog/shadeform_catalog.py +165 -0
  9. sky/client/cli/command.py +53 -19
  10. sky/client/sdk.py +13 -1
  11. sky/clouds/__init__.py +2 -0
  12. sky/clouds/shadeform.py +393 -0
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/jobs/controller.py +122 -145
  30. sky/jobs/recovery_strategy.py +59 -82
  31. sky/jobs/scheduler.py +5 -5
  32. sky/jobs/state.py +65 -21
  33. sky/jobs/utils.py +58 -22
  34. sky/metrics/utils.py +27 -6
  35. sky/provision/__init__.py +1 -0
  36. sky/provision/kubernetes/utils.py +44 -39
  37. sky/provision/shadeform/__init__.py +11 -0
  38. sky/provision/shadeform/config.py +12 -0
  39. sky/provision/shadeform/instance.py +351 -0
  40. sky/provision/shadeform/shadeform_utils.py +83 -0
  41. sky/server/common.py +4 -2
  42. sky/server/requests/executor.py +25 -3
  43. sky/server/server.py +9 -3
  44. sky/setup_files/dependencies.py +1 -0
  45. sky/sky_logging.py +0 -2
  46. sky/skylet/constants.py +23 -6
  47. sky/skylet/log_lib.py +0 -1
  48. sky/skylet/log_lib.pyi +1 -1
  49. sky/templates/shadeform-ray.yml.j2 +72 -0
  50. sky/utils/common.py +2 -0
  51. sky/utils/context.py +57 -51
  52. sky/utils/context_utils.py +15 -11
  53. sky/utils/controller_utils.py +35 -8
  54. sky/utils/locks.py +20 -5
  55. sky/utils/subprocess_utils.py +4 -3
  56. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
  57. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
  58. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
  59. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
  60. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
  61. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
  62. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
  63. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,72 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.shadeform
11
+ region: "{{region}}"
12
+ disable_launch_config_check: true
13
+
14
+ auth:
15
+ ssh_user: shadeform
16
+ ssh_private_key: {{ssh_private_key}}
17
+ ssh_key_id: {{ssh_key_id}}
18
+
19
+ available_node_types:
20
+ ray_head_default:
21
+ {%- if custom_resources %}
22
+ resources: {{custom_resources}}
23
+ {%- else %}
24
+ resources: {}
25
+ {%- endif %}
26
+ node_config:
27
+ InstanceType: {{instance_type}}
28
+ PublicKey: |-
29
+ skypilot:ssh_public_key_content
30
+
31
+ head_node_type: ray_head_default
32
+
33
+ # Format: `REMOTE_PATH : LOCAL_PATH`
34
+ file_mounts: {
35
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
36
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
37
+ {%- for remote_path, local_path in credentials.items() %}
38
+ "{{remote_path}}": "{{local_path}}",
39
+ {%- endfor %}
40
+ }
41
+
42
+ rsync_exclude: []
43
+
44
+ initialization_commands: []
45
+
46
+ # List of shell commands to run to set up nodes.
47
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
48
+ # connection, which is expensive. Try your best to co-locate commands into fewer
49
+ # items!
50
+ #
51
+ # Increment the following to make performance bugs easier to catch:
52
+ # current num items (num SSH connections): 1
53
+ setup_commands:
54
+ # Create ~/.ssh/config file in case the file does not exist in the image.
55
+ # Line 'rm ..': there is another installation of pip.
56
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
57
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
58
+ # Line 'mkdir -p ..': disable host key check
59
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
60
+ - {%- for initial_setup_command in initial_setup_commands %}
61
+ {{ initial_setup_command }}
62
+ {%- endfor %}
63
+ mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
64
+ {{ conda_installation_commands }}
65
+ {{ ray_skypilot_installation_commands }}
66
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
67
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
68
+ (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
69
+ {{ ssh_max_sessions_config }}
70
+
71
+ # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
72
+ # We do not need to list it here anymore.
sky/utils/common.py CHANGED
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
42
42
  JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
43
43
 
44
44
 
45
+ # TODO(kevin): Remove this side effect and have callers call
46
+ # refresh_server_id() explicitly as needed.
45
47
  refresh_server_id()
46
48
 
47
49
 
sky/utils/context.py CHANGED
@@ -5,13 +5,12 @@ from collections.abc import Mapping
5
5
  import contextvars
6
6
  import copy
7
7
  import functools
8
- import inspect
9
8
  import os
10
9
  import pathlib
11
10
  import subprocess
12
11
  import sys
13
- from typing import (Callable, Dict, Iterator, MutableMapping, Optional, TextIO,
14
- TYPE_CHECKING, TypeVar)
12
+ from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
13
+ Optional, TextIO, TYPE_CHECKING, TypeVar)
15
14
 
16
15
  from typing_extensions import ParamSpec
17
16
 
@@ -19,7 +18,7 @@ if TYPE_CHECKING:
19
18
  from sky.skypilot_config import ConfigContext
20
19
 
21
20
 
22
- class Context(object):
21
+ class SkyPilotContext(object):
23
22
  """SkyPilot typed context vars for threads and coroutines.
24
23
 
25
24
  This is a wrapper around `contextvars.ContextVar` that provides a typed
@@ -114,7 +113,14 @@ class Context(object):
114
113
  self._log_file_handle.close()
115
114
  self._log_file_handle = None
116
115
 
117
- def copy(self) -> 'Context':
116
+ def __enter__(self):
117
+ return self
118
+
119
+ def __exit__(self, exc_type, exc_val, exc_tb):
120
+ del exc_type, exc_val, exc_tb
121
+ self.cleanup()
122
+
123
+ def copy(self) -> 'SkyPilotContext':
118
124
  """Create a copy of the context.
119
125
 
120
126
  Changes to the current context after this call will not affect the copy.
@@ -123,18 +129,18 @@ class Context(object):
123
129
  The new context will get an independent copy of the config context.
124
130
  Cancellation of the current context will not be propagated to the copy.
125
131
  """
126
- new_context = Context()
132
+ new_context = SkyPilotContext()
127
133
  new_context.redirect_log(self._log_file)
128
134
  new_context.env_overrides = self.env_overrides.copy()
129
135
  new_context.config_context = copy.deepcopy(self.config_context)
130
136
  return new_context
131
137
 
132
138
 
133
- _CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
134
- default=None)
139
+ _CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
140
+ default=None)
135
141
 
136
142
 
137
- def get() -> Optional[Context]:
143
+ def get() -> Optional[SkyPilotContext]:
138
144
  """Get the current SkyPilot context.
139
145
 
140
146
  If the context is not initialized, get() will return None. This helps
@@ -200,7 +206,7 @@ class ContextualEnviron(MutableMapping[str, str]):
200
206
 
201
207
  def __iter__(self) -> Iterator[str]:
202
208
 
203
- def iter_from_context(ctx: Context) -> Iterator[str]:
209
+ def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
204
210
  deleted_keys = set()
205
211
  for key, value in ctx.env_overrides.items():
206
212
  if value is None:
@@ -311,56 +317,56 @@ def contextual(func: Callable[P, T]) -> Callable[P, T]:
311
317
  context that inherits the values from the existing context.
312
318
  """
313
319
 
320
+ def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
321
+ # Within the new contextvars Context, set up the SkyPilotContext.
322
+ original_ctx = get()
323
+ with initialize(original_ctx):
324
+ return func(*args, **kwargs)
325
+
314
326
  @functools.wraps(func)
315
327
  def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
328
+ # Create a copy of the current contextvars Context so that setting the
329
+ # SkyPilotContext does not affect the caller's context in async
330
+ # environments.
331
+ context = contextvars.copy_context()
332
+ return context.run(run_in_context, *args, **kwargs)
333
+
334
+ return wrapper
335
+
336
+
337
+ def contextual_async(
338
+ func: Callable[P, Coroutine[Any, Any, T]]
339
+ ) -> Callable[P, Coroutine[Any, Any, T]]:
340
+ """Decorator to initialize a context before executing the function.
341
+
342
+ If a context is already initialized, this decorator will create a new
343
+ context that inherits the values from the existing context.
344
+ """
345
+
346
+ async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
347
+ # Within the new contextvars Context, set up the SkyPilotContext.
316
348
  original_ctx = get()
317
- initialize(original_ctx)
318
- ctx = get()
319
- cleanup_after_await = False
320
-
321
- def cleanup():
322
- try:
323
- if ctx is not None:
324
- ctx.cleanup()
325
- finally:
326
- # Note: _CONTEXT.reset() is not reliable - may fail with
327
- # ValueError: <Token ... at ...> was created in a different
328
- # Context
329
- # We must make sure this happens because otherwise we may try to
330
- # write to the wrong log.
331
- _CONTEXT.set(original_ctx)
332
-
333
- # There are two cases:
334
- # 1. The function is synchronous (that is, return type is not awaitable)
335
- # In this case, we use a finally block to cleanup the context.
336
- # 2. The function is asynchronous (that is, return type is awaitable)
337
- # In this case, we need to construct an async def wrapper and await
338
- # the value, then call the cleanup function in the finally block.
339
-
340
- async def await_with_cleanup(awaitable):
341
- try:
342
- return await awaitable
343
- finally:
344
- cleanup()
345
-
346
- try:
347
- ret = func(*args, **kwargs)
348
- if inspect.isawaitable(ret):
349
- cleanup_after_await = True
350
- return await_with_cleanup(ret)
351
- else:
352
- return ret
353
- finally:
354
- if not cleanup_after_await:
355
- cleanup()
349
+ with initialize(original_ctx):
350
+ return await func(*args, **kwargs)
351
+
352
+ @functools.wraps(func)
353
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
354
+ # Create a copy of the current contextvars Context so that setting the
355
+ # SkyPilotContext does not affect the caller's context in async
356
+ # environments.
357
+ context = contextvars.copy_context()
358
+ return await context.run(run_in_context, *args, **kwargs)
356
359
 
357
360
  return wrapper
358
361
 
359
362
 
360
- def initialize(base_context: Optional[Context] = None) -> None:
363
+ def initialize(
364
+ base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
361
365
  """Initialize the current SkyPilot context."""
362
- new_context = base_context.copy() if base_context is not None else Context()
366
+ new_context = base_context.copy(
367
+ ) if base_context is not None else SkyPilotContext()
363
368
  _CONTEXT.set(new_context)
369
+ return new_context
364
370
 
365
371
 
366
372
  class _ContextualStream:
@@ -1,5 +1,6 @@
1
1
  """Utilities for SkyPilot context."""
2
2
  import asyncio
3
+ import concurrent.futures
3
4
  import contextvars
4
5
  import functools
5
6
  import io
@@ -59,7 +60,7 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
59
60
 
60
61
 
61
62
  def pipe_and_wait_process(
62
- ctx: context.Context,
63
+ ctx: context.SkyPilotContext,
63
64
  proc: subprocess.Popen,
64
65
  poll_interval: float = 0.5,
65
66
  cancel_callback: Optional[Callable[[], None]] = None,
@@ -112,7 +113,7 @@ def pipe_and_wait_process(
112
113
  return stdout, stderr
113
114
 
114
115
 
115
- def wait_process(ctx: context.Context,
116
+ def wait_process(ctx: context.SkyPilotContext,
116
117
  proc: subprocess.Popen,
117
118
  poll_interval: float = 0.5,
118
119
  cancel_callback: Optional[Callable[[], None]] = None):
@@ -191,14 +192,17 @@ def to_thread(func: Callable[P, T], /, *args: P.args,
191
192
 
192
193
  This is the same as asyncio.to_thread, added in Python 3.9
193
194
  """
195
+ return to_thread_with_executor(None, func, *args, **kwargs)
196
+
197
+
198
+ def to_thread_with_executor(executor: Optional[concurrent.futures.Executor],
199
+ func: Callable[P, T], /, *args: P.args,
200
+ **kwargs: P.kwargs) -> 'asyncio.Future[T]':
201
+ """Asynchronously run function *func* in a separate thread with
202
+ a custom executor."""
203
+
194
204
  loop = asyncio.get_running_loop()
195
- # This is critical to pass the current coroutine context to the new thread
196
205
  pyctx = contextvars.copy_context()
197
- func_call: Callable[..., T] = functools.partial(
198
- # partial deletes arguments type and thus can't figure out the return
199
- # type of pyctx.run
200
- pyctx.run, # type: ignore
201
- func,
202
- *args,
203
- **kwargs)
204
- return loop.run_in_executor(None, func_call)
206
+ func_call: Callable[..., T] = functools.partial(pyctx.run, func, *args,
207
+ **kwargs)
208
+ return loop.run_in_executor(executor, func_call)
@@ -72,7 +72,8 @@ class _ControllerSpec:
72
72
  """Spec for skypilot controllers."""
73
73
  controller_type: str
74
74
  name: str
75
- cluster_name: str
75
+ _cluster_name_func: Callable[[], str]
76
+ _cluster_name_from_server: Optional[str] # For client-side only
76
77
  in_progress_hint: Callable[[bool], str]
77
78
  decline_cancel_hint: str
78
79
  _decline_down_when_failed_to_fetch_status_hint: str
@@ -93,6 +94,24 @@ class _ControllerSpec:
93
94
  return self._check_cluster_name_hint.format(
94
95
  cluster_name=self.cluster_name)
95
96
 
97
+ @property
98
+ def cluster_name(self) -> str:
99
+ """The cluster name of the controller.
100
+
101
+ On the server-side, the cluster name is the actual cluster name,
102
+ which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
103
+
104
+ On the client-side, the cluster name may not be accurate,
105
+ as we may not know the exact name, because we are missing
106
+ the server-side common.SERVER_ID. We have to wait until
107
+ we get the actual cluster name from the server.
108
+ """
109
+ return (self._cluster_name_from_server if self._cluster_name_from_server
110
+ is not None else self._cluster_name_func())
111
+
112
+ def set_cluster_name_from_server(self, cluster_name: str) -> None:
113
+ self._cluster_name_from_server = cluster_name
114
+
96
115
 
97
116
  # TODO: refactor controller class to not be an enum.
98
117
  class Controllers(enum.Enum):
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
102
121
  JOBS_CONTROLLER = _ControllerSpec(
103
122
  controller_type='jobs',
104
123
  name='managed jobs controller',
105
- cluster_name=common.JOB_CONTROLLER_NAME,
124
+ _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
125
+ _cluster_name_from_server=None,
106
126
  in_progress_hint=lambda _:
107
127
  ('* {job_info}To see all managed jobs: '
108
128
  f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
133
153
  SKY_SERVE_CONTROLLER = _ControllerSpec(
134
154
  controller_type='serve',
135
155
  name='serve controller',
136
- cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
156
+ _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
157
+ _cluster_name_from_server=None,
137
158
  in_progress_hint=(
138
159
  lambda pool:
139
160
  (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
166
187
  default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
167
188
 
168
189
  @classmethod
169
- def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
190
+ def from_name(cls,
191
+ name: Optional[str],
192
+ expect_exact_match: bool = True) -> Optional['Controllers']:
170
193
  """Check if the cluster name is a controller name.
171
194
 
172
195
  Returns:
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
187
210
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
188
211
  controller = cls.JOBS_CONTROLLER
189
212
  prefix = common.JOB_CONTROLLER_PREFIX
190
- if controller is not None and name != controller.value.cluster_name:
213
+
214
+ if controller is not None and expect_exact_match:
215
+ assert name == controller.value.cluster_name, (
216
+ name, controller.value.cluster_name)
217
+ elif controller is not None and name != controller.value.cluster_name:
191
218
  # The client-side cluster_name is not accurate. Assume that `name`
192
219
  # is the actual cluster name, so need to set the controller's
193
220
  # cluster name to the input name.
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
201
228
  prefix)
202
229
 
203
230
  # Update the cluster name.
204
- controller.value.cluster_name = name
231
+ controller.value.set_cluster_name_from_server(name)
205
232
  return controller
206
233
 
207
234
  @classmethod
@@ -228,7 +255,7 @@ def get_controller_for_pool(pool: bool) -> Controllers:
228
255
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
229
256
  """Check if the controller high availability is specified in user config.
230
257
  """
231
- controller = Controllers.from_name(cluster_name)
258
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
232
259
  if controller is None:
233
260
  return False
234
261
 
@@ -411,7 +438,7 @@ def check_cluster_name_not_controller(
411
438
  Returns:
412
439
  None, if the cluster name is not a controller name.
413
440
  """
414
- controller = Controllers.from_name(cluster_name)
441
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
415
442
  if controller is not None:
416
443
  msg = controller.value.check_cluster_name_hint
417
444
  if operation_str is not None:
sky/utils/locks.py CHANGED
@@ -243,6 +243,7 @@ class PostgresLock(DistributedLock):
243
243
  if not self._acquired or not self._connection:
244
244
  return
245
245
 
246
+ connection_lost = False
246
247
  try:
247
248
  cursor = self._connection.cursor()
248
249
  cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
@@ -252,8 +253,11 @@ class PostgresLock(DistributedLock):
252
253
  # Lost connection to the database, likely the lock is force unlocked
253
254
  # by other routines.
254
255
  logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
256
+ connection_lost = True
255
257
  finally:
256
- self._close_connection()
258
+ # Invalidate if connection was lost to prevent SQLAlchemy from
259
+ # trying to reset a dead connection
260
+ self._close_connection(invalidate=connection_lost)
257
261
 
258
262
  def force_unlock(self) -> None:
259
263
  """Force unlock the postgres advisory lock."""
@@ -292,13 +296,24 @@ class PostgresLock(DistributedLock):
292
296
  finally:
293
297
  self._close_connection()
294
298
 
295
- def _close_connection(self) -> None:
296
- """Close the postgres connection."""
299
+ def _close_connection(self, invalidate: bool = False) -> None:
300
+ """Close the postgres connection.
301
+
302
+ Args:
303
+ invalidate: If True, invalidate connection instead of closing it.
304
+ Use this when the connection might be broken (e.g., after
305
+ pg_terminate_backend) to prevent SQLAlchemy from trying to
306
+ reset it (which would result in an error being logged).
307
+ """
297
308
  if self._connection:
298
309
  try:
299
- self._connection.close()
310
+ if invalidate:
311
+ self._connection.invalidate()
312
+ else:
313
+ self._connection.close()
300
314
  except Exception as e: # pylint: disable=broad-except
301
- logger.debug(f'Failed to close postgres connection: {e}')
315
+ logger.debug(
316
+ f'Failed to invalidate or close postgres connection: {e}')
302
317
  self._connection = None
303
318
 
304
319
  def is_locked(self) -> bool:
@@ -10,7 +10,8 @@ import sys
10
10
  import threading
11
11
  import time
12
12
  import typing
13
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
13
+ from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
14
+ Union)
14
15
 
15
16
  import colorama
16
17
 
@@ -107,7 +108,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
107
108
 
108
109
 
109
110
  def run_in_parallel(func: Callable,
110
- args: List[Any],
111
+ args: Union[List[Any], Set[Any]],
111
112
  num_threads: Optional[int] = None) -> List[Any]:
112
113
  """Run a function in parallel on a list of arguments.
113
114
 
@@ -128,7 +129,7 @@ def run_in_parallel(func: Callable,
128
129
  if len(args) == 0:
129
130
  return []
130
131
  if len(args) == 1:
131
- return [func(args[0])]
132
+ return [func(list(args)[0])]
132
133
 
133
134
  processes = (num_threads
134
135
  if num_threads is not None else get_parallel_threads())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20251012
3
+ Version: 1.0.0.dev20251014
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -154,52 +154,53 @@ Requires-Dist: grpcio>=1.63.0; extra == "server"
154
154
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
155
155
  Requires-Dist: aiosqlite; extra == "server"
156
156
  Requires-Dist: greenlet; extra == "server"
157
+ Provides-Extra: shadeform
157
158
  Provides-Extra: all
159
+ Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
160
+ Requires-Dist: botocore>=1.29.10; extra == "all"
161
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
162
+ Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
163
+ Requires-Dist: passlib; extra == "all"
164
+ Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
165
+ Requires-Dist: awscli>=1.27.10; extra == "all"
166
+ Requires-Dist: azure-common; extra == "all"
167
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
168
+ Requires-Dist: colorama<0.4.5; extra == "all"
169
+ Requires-Dist: pyjwt; extra == "all"
170
+ Requires-Dist: sqlalchemy_adapter; extra == "all"
171
+ Requires-Dist: ray[default]>=2.6.1; extra == "all"
172
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
173
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
174
+ Requires-Dist: websockets; extra == "all"
158
175
  Requires-Dist: anyio; extra == "all"
159
- Requires-Dist: nebius>=0.2.47; extra == "all"
176
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
177
+ Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
160
178
  Requires-Dist: ecsapi>=0.2.0; extra == "all"
161
- Requires-Dist: ibm-cos-sdk; extra == "all"
179
+ Requires-Dist: msgraph-sdk; extra == "all"
162
180
  Requires-Dist: python-dateutil; extra == "all"
181
+ Requires-Dist: msrestazure; extra == "all"
182
+ Requires-Dist: nebius>=0.2.47; extra == "all"
183
+ Requires-Dist: ibm-vpc; extra == "all"
184
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
185
+ Requires-Dist: ibm-cloud-sdk-core; extra == "all"
163
186
  Requires-Dist: azure-core>=1.31.0; extra == "all"
164
- Requires-Dist: aiosqlite; extra == "all"
165
- Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
166
- Requires-Dist: cudo-compute>=0.1.10; extra == "all"
167
- Requires-Dist: pydo>=0.3.0; extra == "all"
168
187
  Requires-Dist: casbin; extra == "all"
169
- Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
170
- Requires-Dist: boto3>=1.26.1; extra == "all"
171
- Requires-Dist: sqlalchemy_adapter; extra == "all"
172
- Requires-Dist: passlib; extra == "all"
173
- Requires-Dist: greenlet; extra == "all"
174
- Requires-Dist: msrestazure; extra == "all"
175
- Requires-Dist: colorama<0.4.5; extra == "all"
176
- Requires-Dist: azure-common; extra == "all"
177
- Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
178
- Requires-Dist: websockets; extra == "all"
179
- Requires-Dist: tomli; python_version < "3.11" and extra == "all"
180
- Requires-Dist: ray[default]>=2.6.1; extra == "all"
181
- Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
182
- Requires-Dist: google-cloud-storage; extra == "all"
183
- Requires-Dist: docker; extra == "all"
184
188
  Requires-Dist: grpcio>=1.63.0; extra == "all"
185
- Requires-Dist: msgraph-sdk; extra == "all"
186
- Requires-Dist: ibm-vpc; extra == "all"
187
- Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
188
- Requires-Dist: pyjwt; extra == "all"
189
- Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
190
- Requires-Dist: botocore>=1.29.10; extra == "all"
191
- Requires-Dist: azure-cli>=2.65.0; extra == "all"
189
+ Requires-Dist: docker; extra == "all"
192
190
  Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
191
+ Requires-Dist: aiohttp; extra == "all"
192
+ Requires-Dist: tomli; python_version < "3.11" and extra == "all"
193
+ Requires-Dist: google-cloud-storage; extra == "all"
194
+ Requires-Dist: pydo>=0.3.0; extra == "all"
193
195
  Requires-Dist: oci; extra == "all"
194
- Requires-Dist: awscli>=1.27.10; extra == "all"
195
- Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
196
- Requires-Dist: azure-core>=1.24.0; extra == "all"
197
- Requires-Dist: azure-identity>=1.19.0; extra == "all"
198
196
  Requires-Dist: runpod>=1.6.1; extra == "all"
199
- Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
200
- Requires-Dist: ibm-cloud-sdk-core; extra == "all"
201
- Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
202
- Requires-Dist: aiohttp; extra == "all"
197
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
198
+ Requires-Dist: cudo-compute>=0.1.10; extra == "all"
199
+ Requires-Dist: ibm-cos-sdk; extra == "all"
200
+ Requires-Dist: aiosqlite; extra == "all"
201
+ Requires-Dist: boto3>=1.26.1; extra == "all"
202
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
203
+ Requires-Dist: greenlet; extra == "all"
203
204
  Dynamic: author
204
205
  Dynamic: classifier
205
206
  Dynamic: description
@@ -249,7 +250,7 @@ Dynamic: summary
249
250
  ----
250
251
 
251
252
  :fire: *News* :fire:
252
- - [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./llm/torchtitan/)
253
+ - [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
253
254
  - [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
254
255
  - [Sep 2025] Network and Storage Benchmarks for LLM training on the cloud: [**blog**](https://maknee.github.io/blog/2025/Network-And-Storage-Training-Skypilot/)
255
256
  - [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)