skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +160 -23
  4. sky/backends/cloud_vm_ray_backend.py +226 -74
  5. sky/catalog/__init__.py +7 -0
  6. sky/catalog/aws_catalog.py +4 -0
  7. sky/catalog/common.py +18 -0
  8. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  9. sky/client/cli/command.py +2 -71
  10. sky/client/sdk.py +20 -0
  11. sky/client/sdk_async.py +23 -18
  12. sky/clouds/aws.py +26 -6
  13. sky/clouds/cloud.py +8 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/volumes.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/data/storage.py +5 -1
  33. sky/execution.py +21 -14
  34. sky/global_user_state.py +34 -0
  35. sky/jobs/client/sdk_async.py +4 -2
  36. sky/jobs/constants.py +3 -0
  37. sky/jobs/controller.py +734 -310
  38. sky/jobs/recovery_strategy.py +251 -129
  39. sky/jobs/scheduler.py +247 -174
  40. sky/jobs/server/core.py +20 -4
  41. sky/jobs/server/utils.py +2 -2
  42. sky/jobs/state.py +709 -508
  43. sky/jobs/utils.py +90 -40
  44. sky/logs/agent.py +10 -2
  45. sky/provision/aws/config.py +4 -1
  46. sky/provision/gcp/config.py +6 -1
  47. sky/provision/kubernetes/config.py +7 -2
  48. sky/provision/kubernetes/instance.py +84 -41
  49. sky/provision/kubernetes/utils.py +17 -8
  50. sky/provision/provisioner.py +1 -0
  51. sky/provision/vast/instance.py +1 -1
  52. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  53. sky/serve/replica_managers.py +0 -7
  54. sky/serve/serve_utils.py +5 -0
  55. sky/serve/server/impl.py +1 -2
  56. sky/serve/service.py +0 -2
  57. sky/server/common.py +8 -3
  58. sky/server/config.py +55 -27
  59. sky/server/constants.py +1 -0
  60. sky/server/daemons.py +7 -11
  61. sky/server/metrics.py +41 -8
  62. sky/server/requests/executor.py +41 -4
  63. sky/server/requests/serializers/encoders.py +1 -1
  64. sky/server/server.py +9 -1
  65. sky/server/uvicorn.py +11 -5
  66. sky/setup_files/dependencies.py +4 -2
  67. sky/skylet/attempt_skylet.py +1 -0
  68. sky/skylet/constants.py +14 -7
  69. sky/skylet/events.py +2 -10
  70. sky/skylet/log_lib.py +11 -0
  71. sky/skylet/log_lib.pyi +9 -0
  72. sky/task.py +62 -0
  73. sky/templates/kubernetes-ray.yml.j2 +120 -3
  74. sky/utils/accelerator_registry.py +3 -1
  75. sky/utils/command_runner.py +35 -11
  76. sky/utils/command_runner.pyi +25 -3
  77. sky/utils/common_utils.py +11 -1
  78. sky/utils/context_utils.py +15 -2
  79. sky/utils/controller_utils.py +5 -0
  80. sky/utils/db/db_utils.py +31 -2
  81. sky/utils/db/migration_utils.py +1 -1
  82. sky/utils/git.py +559 -1
  83. sky/utils/resource_checker.py +8 -7
  84. sky/utils/rich_utils.py +3 -1
  85. sky/utils/subprocess_utils.py +9 -0
  86. sky/volumes/volume.py +2 -0
  87. sky/workspaces/core.py +57 -21
  88. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
  89. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
  90. sky/client/cli/git.py +0 -549
  91. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  92. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  93. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  94. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  95. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/workspaces/core.py CHANGED
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
14
14
  from sky.skylet import constants
15
15
  from sky.usage import usage_lib
16
16
  from sky.users import permission
17
+ from sky.users import rbac
17
18
  from sky.utils import annotations
18
19
  from sky.utils import common_utils
19
20
  from sky.utils import config_utils
@@ -147,11 +148,15 @@ def _compare_workspace_configs(
147
148
  private_new = new_config.get('private', False)
148
149
  private_changed = private_old != private_new
149
150
 
151
+ admin_user_ids = permission.permission_service.get_users_for_role(
152
+ rbac.RoleName.ADMIN.value)
150
153
  # Get allowed users (resolve to user IDs for comparison)
151
154
  allowed_users_old = workspaces_utils.get_workspace_users(
152
155
  current_config) if private_old else []
156
+ allowed_users_old += admin_user_ids
153
157
  allowed_users_new = workspaces_utils.get_workspace_users(
154
158
  new_config) if private_new else []
159
+ allowed_users_new += admin_user_ids
155
160
 
156
161
  # Convert to sets for easier comparison
157
162
  old_users_set = set(allowed_users_old)
@@ -188,6 +193,24 @@ def _compare_workspace_configs(
188
193
  added_users=added_users)
189
194
 
190
195
 
196
+ def _validate_workspace_config_changes_with_lock(
197
+ workspace_name: str, current_config: Dict[str, Any],
198
+ new_config: Dict[str, Any]) -> None:
199
+ lock_id = backend_utils.workspace_lock_id(workspace_name)
200
+ lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
201
+ try:
202
+ with locks.get_lock(lock_id, lock_timeout):
203
+ # Validate the configuration changes based on active resources
204
+ _validate_workspace_config_changes(workspace_name, current_config,
205
+ new_config)
206
+ except locks.LockTimeout as e:
207
+ raise RuntimeError(
208
+ f'Failed to validate workspace {workspace_name!r} due to '
209
+ 'a timeout when trying to access database. Please '
210
+ f'try again or manually remove the lock at {lock_id}. '
211
+ f'{common_utils.format_exception(e)}') from None
212
+
213
+
191
214
  def _validate_workspace_config_changes(workspace_name: str,
192
215
  current_config: Dict[str, Any],
193
216
  new_config: Dict[str, Any]) -> None:
@@ -232,7 +255,7 @@ def _validate_workspace_config_changes(workspace_name: str,
232
255
  f' private. Checking that all active resources belong'
233
256
  f' to allowed users.')
234
257
 
235
- error_summary, missed_users_names = (
258
+ error_summary, missed_users_names, _ = (
236
259
  resource_checker.check_users_workspaces_active_resources(
237
260
  config_comparison.allowed_users_new, [workspace_name]))
238
261
  if error_summary:
@@ -259,11 +282,35 @@ def _validate_workspace_config_changes(workspace_name: str,
259
282
  f'Checking that removed users'
260
283
  f' {config_comparison.removed_users} do not have'
261
284
  f' active resources in workspace {workspace_name!r}.')
262
- user_operations = []
263
- for user_id in config_comparison.removed_users:
264
- user_operations.append((user_id, 'remove'))
265
- resource_checker.check_no_active_resources_for_users(
266
- user_operations)
285
+ error_summary, missed_users_names, missed_user_dict = (
286
+ resource_checker.check_users_workspaces_active_resources(
287
+ config_comparison.allowed_users_new, [workspace_name]))
288
+ if error_summary:
289
+ error_user_ids = []
290
+ for user_id in config_comparison.removed_users:
291
+ if user_id in missed_user_dict:
292
+ error_user_ids.append(user_id)
293
+ error_user_names = []
294
+ if error_user_ids:
295
+ error_user_names = [
296
+ missed_user_dict[user_id]
297
+ for user_id in error_user_ids
298
+ ]
299
+
300
+ error_msg = 'Cannot '
301
+ error_users_list = ', '.join(error_user_names)
302
+ if len(error_user_names) == 1:
303
+ error_msg += f'remove user {error_users_list!r} ' \
304
+ f'from workspace {workspace_name!r} because the ' \
305
+ f'user has {error_summary}'
306
+ else:
307
+ error_msg += f'remove users {error_users_list!r}' \
308
+ f' from workspace {workspace_name!r} because the' \
309
+ f' users have {error_summary}'
310
+ error_msg += ', but not in the allowed_users list.' \
311
+ ' Please either add the users to allowed_users or' \
312
+ ' ask them to terminate their resources.'
313
+ raise ValueError(error_msg)
267
314
  else:
268
315
  # Other configuration changes - check that workspace has no active
269
316
  # resources
@@ -310,20 +357,8 @@ def update_workspace(workspace_name: str, config: Dict[str,
310
357
  default_value={})
311
358
  current_config = current_workspaces.get(workspace_name, {})
312
359
 
313
- if current_config:
314
- lock_id = backend_utils.workspace_lock_id(workspace_name)
315
- lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
316
- try:
317
- with locks.get_lock(lock_id, lock_timeout):
318
- # Validate the configuration changes based on active resources
319
- _validate_workspace_config_changes(workspace_name,
320
- current_config, config)
321
- except locks.LockTimeout as e:
322
- raise RuntimeError(
323
- f'Failed to validate workspace {workspace_name!r} due to '
324
- 'a timeout when trying to access database. Please '
325
- f'try again or manually remove the lock at {lock_id}. '
326
- f'{common_utils.format_exception(e)}') from None
360
+ _validate_workspace_config_changes_with_lock(workspace_name, current_config,
361
+ config)
327
362
 
328
363
  def update_workspace_fn(workspaces: Dict[str, Any]) -> None:
329
364
  """Function to update workspace inside the lock."""
@@ -510,7 +545,8 @@ def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
510
545
  # If workspace configuration is changing, validate and mark for checking
511
546
  if current_workspace_config != new_workspace_config:
512
547
  _validate_workspace_config(workspace_name, new_workspace_config)
513
- workspaces_to_check.append((workspace_name, 'update'))
548
+ _validate_workspace_config_changes_with_lock(
549
+ workspace_name, current_workspace_config, new_workspace_config)
514
550
  users = workspaces_utils.get_workspace_users(new_workspace_config)
515
551
  workspaces_to_check_policy['update'][workspace_name] = users
516
552
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250909
3
+ Version: 1.0.0.dev20250912
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -53,6 +53,8 @@ Requires-Dist: httpx
53
53
  Requires-Dist: setproctitle
54
54
  Requires-Dist: sqlalchemy
55
55
  Requires-Dist: psycopg2-binary
56
+ Requires-Dist: aiosqlite
57
+ Requires-Dist: asyncpg
56
58
  Requires-Dist: casbin
57
59
  Requires-Dist: sqlalchemy_adapter
58
60
  Requires-Dist: prometheus_client>=0.8.0
@@ -88,10 +90,10 @@ Requires-Dist: ibm-cloud-sdk-core; extra == "ibm"
88
90
  Requires-Dist: ibm-vpc; extra == "ibm"
89
91
  Requires-Dist: ibm-platform-services>=0.48.0; extra == "ibm"
90
92
  Requires-Dist: ibm-cos-sdk; extra == "ibm"
91
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "ibm"
93
+ Requires-Dist: ray[default]>=2.6.1; extra == "ibm"
92
94
  Provides-Extra: docker
93
95
  Requires-Dist: docker; extra == "docker"
94
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "docker"
96
+ Requires-Dist: ray[default]>=2.6.1; extra == "docker"
95
97
  Provides-Extra: lambda
96
98
  Provides-Extra: cloudflare
97
99
  Requires-Dist: awscli>=1.27.10; extra == "cloudflare"
@@ -99,7 +101,7 @@ Requires-Dist: botocore>=1.29.10; extra == "cloudflare"
99
101
  Requires-Dist: boto3>=1.26.1; extra == "cloudflare"
100
102
  Requires-Dist: colorama<0.4.5; extra == "cloudflare"
101
103
  Provides-Extra: scp
102
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "scp"
104
+ Requires-Dist: ray[default]>=2.6.1; extra == "scp"
103
105
  Provides-Extra: oci
104
106
  Requires-Dist: oci; extra == "oci"
105
107
  Provides-Extra: kubernetes
@@ -145,48 +147,48 @@ Requires-Dist: grpcio>=1.63.0; extra == "server"
145
147
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
146
148
  Requires-Dist: aiosqlite; extra == "server"
147
149
  Provides-Extra: all
150
+ Requires-Dist: oci; extra == "all"
151
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
152
+ Requires-Dist: msgraph-sdk; extra == "all"
153
+ Requires-Dist: ibm-cos-sdk; extra == "all"
148
154
  Requires-Dist: passlib; extra == "all"
149
- Requires-Dist: boto3>=1.26.1; extra == "all"
150
- Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
155
+ Requires-Dist: cudo-compute>=0.1.10; extra == "all"
156
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
157
+ Requires-Dist: colorama<0.4.5; extra == "all"
151
158
  Requires-Dist: awscli>=1.27.10; extra == "all"
152
- Requires-Dist: runpod>=1.6.1; extra == "all"
153
- Requires-Dist: python-dateutil; extra == "all"
159
+ Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
160
+ Requires-Dist: ibm-cloud-sdk-core; extra == "all"
161
+ Requires-Dist: casbin; extra == "all"
154
162
  Requires-Dist: anyio; extra == "all"
155
- Requires-Dist: oci; extra == "all"
163
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
164
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
165
+ Requires-Dist: botocore>=1.29.10; extra == "all"
166
+ Requires-Dist: pydo>=0.3.0; extra == "all"
167
+ Requires-Dist: grpcio>=1.63.0; extra == "all"
168
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
169
+ Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
170
+ Requires-Dist: ray[default]>=2.6.1; extra == "all"
171
+ Requires-Dist: boto3>=1.26.1; extra == "all"
172
+ Requires-Dist: docker; extra == "all"
156
173
  Requires-Dist: ibm-vpc; extra == "all"
157
- Requires-Dist: pyjwt; extra == "all"
158
- Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
159
- Requires-Dist: casbin; extra == "all"
174
+ Requires-Dist: aiosqlite; extra == "all"
175
+ Requires-Dist: websockets; extra == "all"
176
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
160
177
  Requires-Dist: nebius>=0.2.47; extra == "all"
161
178
  Requires-Dist: azure-common; extra == "all"
162
- Requires-Dist: azure-core>=1.31.0; extra == "all"
163
- Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
164
- Requires-Dist: pydo>=0.3.0; extra == "all"
165
- Requires-Dist: aiosqlite; extra == "all"
166
179
  Requires-Dist: google-cloud-storage; extra == "all"
167
- Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
168
- Requires-Dist: colorama<0.4.5; extra == "all"
169
- Requires-Dist: azure-cli>=2.65.0; extra == "all"
170
- Requires-Dist: grpcio>=1.63.0; extra == "all"
171
- Requires-Dist: ibm-cloud-sdk-core; extra == "all"
180
+ Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
181
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
182
+ Requires-Dist: python-dateutil; extra == "all"
183
+ Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
184
+ Requires-Dist: sqlalchemy_adapter; extra == "all"
185
+ Requires-Dist: runpod>=1.6.1; extra == "all"
172
186
  Requires-Dist: msrestazure; extra == "all"
173
- Requires-Dist: azure-identity>=1.19.0; extra == "all"
174
- Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
175
- Requires-Dist: msgraph-sdk; extra == "all"
176
- Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
177
- Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
178
- Requires-Dist: docker; extra == "all"
179
- Requires-Dist: ibm-cos-sdk; extra == "all"
187
+ Requires-Dist: pyjwt; extra == "all"
180
188
  Requires-Dist: aiohttp; extra == "all"
181
- Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
189
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
182
190
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
183
- Requires-Dist: websockets; extra == "all"
184
- Requires-Dist: botocore>=1.29.10; extra == "all"
185
- Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
186
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
187
- Requires-Dist: sqlalchemy_adapter; extra == "all"
188
- Requires-Dist: azure-core>=1.24.0; extra == "all"
189
- Requires-Dist: cudo-compute>=0.1.10; extra == "all"
191
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
190
192
  Dynamic: author
191
193
  Dynamic: classifier
192
194
  Dynamic: description