skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
100
100
  # cluster yaml is updated.
101
101
  #
102
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
103
- SKYLET_VERSION = '22'
103
+ SKYLET_VERSION = '25'
104
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
105
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
106
106
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
226
226
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
227
227
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
228
228
  f'|| {RAY_STATUS} || '
229
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
229
+ # The pydantic-core==2.41.3 for arm seems corrupted
230
+ # so we need to avoid that specific version.
231
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
230
232
  # In some envs, e.g. pip does not have permission to write under /opt/conda
231
233
  # ray package will be installed under ~/.local/bin. If the user's PATH does
232
234
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -402,10 +404,27 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
402
404
  ]
403
405
  # When overriding the SkyPilot configs on the API server with the client one,
404
406
  # we skip the following keys because they are meant to be client-side configs.
405
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
406
- ('allowed_clouds',),
407
- ('workspaces',), ('db',),
408
- ('daemons',)]
407
+ # Also, we skip the consolidation mode config as those should be only set on
408
+ # the API server side.
409
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
410
+ ('api_server',),
411
+ ('allowed_clouds',),
412
+ ('workspaces',),
413
+ ('db',),
414
+ ('daemons',),
415
+ # TODO(kevin,tian): Override the whole controller config once our test
416
+ # infrastructure supports setting dynamic server side configs.
417
+ # Tests that are affected:
418
+ # - test_managed_jobs_ha_kill_starting
419
+ # - test_managed_jobs_ha_kill_running
420
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
421
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
422
+ # but the configs won't be applied)
423
+ ('jobs', 'controller', 'consolidation_mode'),
424
+ ('serve', 'controller', 'consolidation_mode'),
425
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
426
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
427
+ ]
409
428
 
410
429
  # Constants for Azure blob storage
411
430
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -455,6 +474,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
455
474
  # authentication is enabled in the API server.
456
475
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
457
476
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
477
+ SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
458
478
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
459
479
 
460
480
  # Enable debug logging for requests.
@@ -471,7 +491,7 @@ CATALOG_DIR = '~/.sky/catalogs'
471
491
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
472
492
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
473
493
  'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
474
- 'hyperbolic', 'seeweb')
494
+ 'hyperbolic', 'seeweb', 'shadeform')
475
495
  # END constants used for service catalog.
476
496
 
477
497
  # The user ID of the SkyPilot system.
@@ -531,3 +551,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
531
551
 
532
552
  ARM64_ARCH = 'arm64'
533
553
  X86_64_ARCH = 'x86_64'
554
+
555
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
556
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
326
326
  cluster_name_on_cloud = cluster_config['cluster_name']
327
327
  is_cluster_multinode = cluster_config['max_workers'] > 0
328
328
 
329
+ # Clear AWS credentials from environment to force boto3 to use IAM
330
+ # role attached to the instance (lowest priority in credential chain).
331
+ # This allows the cluster to stop/terminate itself using its IAM role.
329
332
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
330
333
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
334
+ os.environ.pop('AWS_SESSION_TOKEN', None)
335
+ # Point boto3 to /dev/null to skip reading credentials from files.
336
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
337
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'
331
338
 
332
339
  # Stop the ray autoscaler to avoid scaling up, during
333
340
  # stopping/terminating of the cluster.
sky/skylet/log_lib.py CHANGED
@@ -220,7 +220,14 @@ def run_with_log(
220
220
  stdin=stdin,
221
221
  **kwargs) as proc:
222
222
  try:
223
- subprocess_utils.kill_process_daemon(proc.pid)
223
+ if ctx is not None:
224
+ # When runs in coroutine, use kill_pg if available to avoid
225
+ # the overhead of refreshing the process tree in the daemon.
226
+ subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
227
+ else:
228
+ # For backward compatibility, do not specify use_kill_pg by
229
+ # default.
230
+ subprocess_utils.kill_process_daemon(proc.pid)
224
231
  stdout = ''
225
232
  stderr = ''
226
233
  stdout_stream_handler = None
@@ -271,7 +278,6 @@ def run_with_log(
271
278
  stdout, stderr = context_utils.pipe_and_wait_process(
272
279
  ctx,
273
280
  proc,
274
- cancel_callback=subprocess_utils.kill_children_processes,
275
281
  stdout_stream_handler=stdout_stream_handler,
276
282
  stderr_stream_handler=stderr_stream_handler)
277
283
  elif process_stream:
sky/skylet/log_lib.pyi CHANGED
@@ -42,7 +42,7 @@ class _ProcessingArgs:
42
42
  ...
43
43
 
44
44
 
45
- def _get_context() -> Optional[context.Context]:
45
+ def _get_context() -> Optional[context.SkyPilotContext]:
46
46
  ...
47
47
 
48
48
 
sky/skylet/services.py CHANGED
@@ -216,10 +216,12 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
216
216
  if pool is not None:
217
217
  pool_hash = serve_state.get_service_hash(pool)
218
218
  # Add the managed job to job queue database.
219
+ user_id = managed_job.user_id if managed_job.HasField(
220
+ 'user_id') else None
219
221
  managed_job_state.set_job_info(job_id, managed_job.name,
220
222
  managed_job.workspace,
221
223
  managed_job.entrypoint, pool,
222
- pool_hash)
224
+ pool_hash, user_id)
223
225
  # Set the managed job to PENDING state to make sure that
224
226
  # this managed job appears in the `sky jobs queue`, even
225
227
  # if it needs to wait to be submitted.
@@ -405,18 +407,22 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
405
407
  context: grpc.ServicerContext
406
408
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
407
409
  try:
408
- accessible_workspaces = list(request.accessible_workspaces)
409
- job_ids = list(request.job_ids.ids) if request.job_ids else None
410
+ accessible_workspaces = (
411
+ list(request.accessible_workspaces.workspaces)
412
+ if request.HasField('accessible_workspaces') else None)
413
+ job_ids = (list(request.job_ids.ids)
414
+ if request.HasField('job_ids') else None)
410
415
  user_hashes: Optional[List[Optional[str]]] = None
411
- if request.user_hashes:
416
+ if request.HasField('user_hashes'):
412
417
  user_hashes = list(request.user_hashes.hashes)
413
418
  # For backwards compatibility, we show jobs that do not have a
414
419
  # user_hash. TODO: Remove before 0.12.0.
415
420
  if request.show_jobs_without_user_hash:
416
421
  user_hashes.append(None)
417
- statuses = list(
418
- request.statuses.statuses) if request.statuses else None
419
-
422
+ statuses = (list(request.statuses.statuses)
423
+ if request.HasField('statuses') else None)
424
+ fields = (list(request.fields.fields)
425
+ if request.HasField('fields') else None)
420
426
  job_queue = managed_job_utils.get_managed_job_queue(
421
427
  skip_finished=request.skip_finished,
422
428
  accessible_workspaces=accessible_workspaces,
@@ -430,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
430
436
  page=request.page if request.HasField('page') else None,
431
437
  limit=request.limit if request.HasField('limit') else None,
432
438
  user_hashes=user_hashes,
433
- statuses=statuses)
439
+ statuses=statuses,
440
+ fields=fields,
441
+ )
434
442
  jobs = job_queue['jobs']
435
443
  total = job_queue['total']
436
444
  total_no_filter = job_queue['total_no_filter']
@@ -438,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
438
446
 
439
447
  jobs_info = []
440
448
  for job in jobs:
449
+ converted_metadata = None
450
+ metadata = job.get('metadata')
451
+ if metadata:
452
+ converted_metadata = {
453
+ k: v for k, v in metadata.items() if v is not None
454
+ }
441
455
  job_info = managed_jobsv1_pb2.ManagedJobInfo(
456
+ # The `spot.job_id`, which can be used to identify
457
+ # different tasks for the same job
458
+ _job_id=job.get('_job_id'),
442
459
  job_id=job.get('job_id'),
443
460
  task_id=job.get('task_id'),
444
461
  job_name=job.get('job_name'),
@@ -466,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
466
483
  end_at=job.get('end_at'),
467
484
  user_yaml=job.get('user_yaml'),
468
485
  entrypoint=job.get('entrypoint'),
469
- metadata={
470
- k: v
471
- for k, v in job.get('metadata', {}).items()
472
- if v is not None
473
- },
486
+ metadata=converted_metadata,
474
487
  pool=job.get('pool'),
475
488
  pool_hash=job.get('pool_hash'))
476
489
  jobs_info.append(job_info)
@@ -4,11 +4,16 @@ processes of proc_pid.
4
4
  """
5
5
  import argparse
6
6
  import os
7
+ import signal
7
8
  import sys
8
9
  import time
10
+ from typing import List, Optional
9
11
 
10
12
  import psutil
11
13
 
14
+ # Environment variable to enable kill_pg in subprocess daemon.
15
+ USE_KILL_PG_ENV_VAR = 'SKYPILOT_SUBPROCESS_DAEMON_KILL_PG'
16
+
12
17
 
13
18
  def daemonize():
14
19
  """Detaches the process from its parent process with double-forking.
@@ -38,8 +43,74 @@ def daemonize():
38
43
  # This process is now fully detached from the original parent and terminal
39
44
 
40
45
 
41
- if __name__ == '__main__':
42
- daemonize()
46
+ def get_pgid_if_leader(pid) -> Optional[int]:
47
+ """Get the process group ID of the target process if it is the leader."""
48
+ try:
49
+ pgid = os.getpgid(pid)
50
+ # Only use process group if the target process is the leader. This is
51
+ # to avoid killing the entire process group while the target process is
52
+ # just a subprocess in the group.
53
+ if pgid == pid:
54
+ print(f'Process group {pgid} is the leader.')
55
+ return pgid
56
+ return None
57
+ except Exception: # pylint: disable=broad-except
58
+ # Process group is only available in UNIX.
59
+ return None
60
+
61
+
62
+ def kill_process_group(pgid: int) -> bool:
63
+ """Kill the target process group."""
64
+ try:
65
+ print(f'Terminating process group {pgid}...')
66
+ os.killpg(pgid, signal.SIGTERM)
67
+ except Exception: # pylint: disable=broad-except
68
+ return False
69
+
70
+ # Wait 30s for the process group to exit gracefully.
71
+ time.sleep(30)
72
+
73
+ try:
74
+ print(f'Force killing process group {pgid}...')
75
+ os.killpg(pgid, signal.SIGKILL)
76
+ except Exception: # pylint: disable=broad-except
77
+ pass
78
+
79
+ return True
80
+
81
+
82
+ def kill_process_tree(process: psutil.Process,
83
+ children: List[psutil.Process]) -> bool:
84
+ """Kill the process tree of the target process."""
85
+ if process is not None:
86
+ # Kill the target process first to avoid having more children, or fail
87
+ # the process due to the children being defunct.
88
+ children = [process] + children
89
+
90
+ if not children:
91
+ sys.exit()
92
+
93
+ for child in children:
94
+ try:
95
+ child.terminate()
96
+ except psutil.NoSuchProcess:
97
+ continue
98
+
99
+ # Wait 30s for the processes to exit gracefully.
100
+ time.sleep(30)
101
+
102
+ # SIGKILL if they're still running.
103
+ for child in children:
104
+ try:
105
+ child.kill()
106
+ except psutil.NoSuchProcess:
107
+ continue
108
+
109
+ return True
110
+
111
+
112
+ def main():
113
+ # daemonize()
43
114
  parser = argparse.ArgumentParser()
44
115
  parser.add_argument('--parent-pid', type=int, required=True)
45
116
  parser.add_argument('--proc-pid', type=int, required=True)
@@ -72,37 +143,40 @@ if __name__ == '__main__':
72
143
  except (psutil.NoSuchProcess, ValueError):
73
144
  pass
74
145
 
146
+ pgid: Optional[int] = None
147
+ if os.environ.get(USE_KILL_PG_ENV_VAR) == '1':
148
+ # Use kill_pg on UNIX system if allowed to reduce the resource usage.
149
+ # Note that both implementations might leave subprocesses uncancelled:
150
+ # - kill_process_tree(default): a subprocess is able to detach itself
151
+ # from the process tree using the same technique as daemonize(). Also,
152
+ # since we refresh the process tree per second, if the subprocess is
153
+ # launched between the [last_poll, parent_die] interval, the
154
+ # subprocess will not be captured and will not be killed.
155
+ # - kill_process_group: kill_pg will kill all the processes in the group
156
+ # but if a subprocess calls setpgid(0, 0) to detach itself from the
157
+ # process group (usually to daemonize itself), the subprocess will
158
+ # not be killed.
159
+ pgid = get_pgid_if_leader(process.pid)
160
+
75
161
  if process is not None and parent_process is not None:
76
162
  # Wait for either parent or target process to exit
77
163
  while process.is_running() and parent_process.is_running():
78
- try:
79
- tmp_children = process.children(recursive=True)
80
- if tmp_children:
81
- children = tmp_children
82
- except psutil.NoSuchProcess:
83
- pass
164
+ if pgid is None:
165
+ # Refresh process tree for cleanup if process group is not
166
+ # available.
167
+ try:
168
+ tmp_children = process.children(recursive=True)
169
+ if tmp_children:
170
+ children = tmp_children
171
+ except psutil.NoSuchProcess:
172
+ pass
84
173
  time.sleep(1)
85
174
 
86
- if process is not None:
87
- # Kill the target process first to avoid having more children, or fail
88
- # the process due to the children being defunct.
89
- children = [process] + children
175
+ if pgid is not None:
176
+ kill_process_group(pgid)
177
+ else:
178
+ kill_process_tree(process, children)
90
179
 
91
- if not children:
92
- sys.exit()
93
180
 
94
- for child in children:
95
- try:
96
- child.terminate()
97
- except psutil.NoSuchProcess:
98
- continue
99
-
100
- # Wait 30s for the processes to exit gracefully.
101
- time.sleep(30)
102
-
103
- # SIGKILL if they're still running.
104
- for child in children:
105
- try:
106
- child.kill()
107
- except psutil.NoSuchProcess:
108
- continue
181
+ if __name__ == '__main__':
182
+ main()
sky/skypilot_config.py CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
64
64
  from sqlalchemy.dialects import postgresql
65
65
  from sqlalchemy.dialects import sqlite
66
66
  from sqlalchemy.ext import declarative
67
- from sqlalchemy.pool import NullPool
68
67
 
69
68
  from sky import exceptions
70
69
  from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
77
76
  from sky.utils import ux_utils
78
77
  from sky.utils import yaml_utils
79
78
  from sky.utils.db import db_utils
79
+ from sky.utils.db import migration_utils
80
80
  from sky.utils.kubernetes import config_map_utils
81
81
 
82
82
  if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
121
121
 
122
122
  API_SERVER_CONFIG_KEY = 'api_server_config'
123
123
 
124
- _DB_USE_LOCK = threading.Lock()
124
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
125
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
125
126
 
126
127
  Base = declarative.declarative_base()
127
128
 
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
481
482
  reload_config()
482
483
 
483
484
 
484
- def reload_config() -> None:
485
+ def reload_config(init_db: bool = False) -> None:
485
486
  internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
486
487
  if internal_config_path is not None:
487
488
  # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
493
494
  return
494
495
 
495
496
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
496
- _reload_config_as_server()
497
+ _reload_config_as_server(init_db=init_db)
497
498
  else:
498
499
  _reload_config_as_client()
499
500
 
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
564
565
  _set_loaded_config_path(config_path)
565
566
 
566
567
 
567
- def _reload_config_as_server() -> None:
568
+ def _create_table(engine: sqlalchemy.engine.Engine):
569
+ """Initialize the config database with migrations."""
570
+ migration_utils.safe_alembic_upgrade(
571
+ engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
572
+ migration_utils.SKYPILOT_CONFIG_VERSION)
573
+
574
+
575
+ def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
576
+ """Initialize and return the config database engine.
577
+
578
+ This function should only be called by the API Server during initialization.
579
+ Client-side code should never call this function.
580
+ """
581
+ assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
582
+ 'initialize_and_get_db() can only be called by the API Server')
583
+
584
+ global _SQLALCHEMY_ENGINE
585
+
586
+ if _SQLALCHEMY_ENGINE is not None:
587
+ return _SQLALCHEMY_ENGINE
588
+
589
+ with _SQLALCHEMY_ENGINE_LOCK:
590
+ if _SQLALCHEMY_ENGINE is not None:
591
+ return _SQLALCHEMY_ENGINE
592
+
593
+ # We only store config in the DB when using Postgres,
594
+ # so no need to pass in db_name here.
595
+ engine = db_utils.get_engine(None)
596
+
597
+ # Run migrations if needed
598
+ _create_table(engine)
599
+
600
+ _SQLALCHEMY_ENGINE = engine
601
+ return _SQLALCHEMY_ENGINE
602
+
603
+
604
+ def _reload_config_as_server(init_db: bool = False) -> None:
568
605
  # Reset the global variables, to avoid using stale values.
569
606
  _set_loaded_config(config_utils.Config())
570
607
  _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
580
617
  raise ValueError(
581
618
  'If db config is specified, no other config is allowed')
582
619
  logger.debug('retrieving config from database')
583
- with _DB_USE_LOCK:
584
- dispose_engine = False
585
- if db_utils.get_max_connections() == 0:
586
- dispose_engine = True
587
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
588
- poolclass=NullPool)
589
- else:
590
- sqlalchemy_engine = db_utils.get_engine('config')
591
- db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
592
- sqlalchemy_engine)
593
-
594
- def _get_config_yaml_from_db(
595
- key: str) -> Optional[config_utils.Config]:
596
- assert sqlalchemy_engine is not None
597
- with orm.Session(sqlalchemy_engine) as session:
598
- row = session.query(config_yaml_table).filter_by(
599
- key=key).first()
600
- if row:
601
- db_config = config_utils.Config(
602
- yaml_utils.safe_load(row.value))
603
- db_config.pop_nested(('db',), None)
604
- return db_config
605
- return None
606
-
607
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
608
- if db_config:
609
- server_config = overlay_skypilot_config(server_config,
610
- db_config)
611
- # Close the engine to avoid connection leaks
612
- if dispose_engine:
613
- sqlalchemy_engine.dispose()
620
+
621
+ if init_db:
622
+ _initialize_and_get_db()
623
+
624
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
625
+ assert _SQLALCHEMY_ENGINE is not None
626
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
627
+ row = session.query(config_yaml_table).filter_by(
628
+ key=key).first()
629
+ if row:
630
+ db_config = config_utils.Config(yaml_utils.safe_load(row.value))
631
+ db_config.pop_nested(('db',), None)
632
+ return db_config
633
+ return None
634
+
635
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
636
+ if db_config:
637
+ server_config = overlay_skypilot_config(server_config, db_config)
614
638
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
615
639
  logger.debug(f'server config: \n'
616
640
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
666
690
 
667
691
 
668
692
  # Load on import, synchronization is guaranteed by python interpreter.
669
- reload_config()
693
+ reload_config(init_db=True)
670
694
 
671
695
 
672
696
  def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
880
904
  if new_db_url and new_db_url != existing_db_url:
881
905
  raise ValueError('Cannot change db url while server is running')
882
906
  if existing_db_url:
883
- with _DB_USE_LOCK:
884
- dispose_engine = False
885
- if db_utils.get_max_connections() == 0:
886
- dispose_engine = True
887
- sqlalchemy_engine = sqlalchemy.create_engine(
888
- existing_db_url, poolclass=NullPool)
889
- else:
890
- sqlalchemy_engine = db_utils.get_engine('config')
891
- db_utils.add_all_tables_to_db_sqlalchemy(
892
- Base.metadata, sqlalchemy_engine)
893
-
894
- def _set_config_yaml_to_db(key: str,
895
- config: config_utils.Config):
896
- assert sqlalchemy_engine is not None
897
- config_str = yaml_utils.dump_yaml_str(dict(config))
898
- with orm.Session(sqlalchemy_engine) as session:
899
- if (sqlalchemy_engine.dialect.name ==
900
- db_utils.SQLAlchemyDialect.SQLITE.value):
901
- insert_func = sqlite.insert
902
- elif (sqlalchemy_engine.dialect.name ==
903
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
904
- insert_func = postgresql.insert
905
- else:
906
- raise ValueError('Unsupported database dialect')
907
- insert_stmnt = insert_func(config_yaml_table).values(
908
- key=key, value=config_str)
909
- do_update_stmt = insert_stmnt.on_conflict_do_update(
910
- index_elements=[config_yaml_table.c.key],
911
- set_={config_yaml_table.c.value: config_str})
912
- session.execute(do_update_stmt)
913
- session.commit()
914
-
915
- logger.debug('saving api_server config to db')
916
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
917
- db_updated = True
918
- # Close the engine to avoid connection leaks
919
- if dispose_engine:
920
- sqlalchemy_engine.dispose()
907
+
908
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
909
+ # reload_config(init_db=True) is called when this module is
910
+ # imported, so the database engine must already be initialized.
911
+ assert _SQLALCHEMY_ENGINE is not None
912
+ config_str = yaml_utils.dump_yaml_str(dict(config))
913
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
914
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
915
+ db_utils.SQLAlchemyDialect.SQLITE.value):
916
+ insert_func = sqlite.insert
917
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
918
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
919
+ insert_func = postgresql.insert
920
+ else:
921
+ raise ValueError('Unsupported database dialect')
922
+ insert_stmnt = insert_func(config_yaml_table).values(
923
+ key=key, value=config_str)
924
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
925
+ index_elements=[config_yaml_table.c.key],
926
+ set_={config_yaml_table.c.value: config_str})
927
+ session.execute(do_update_stmt)
928
+ session.commit()
929
+
930
+ logger.debug('saving api_server config to db')
931
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
932
+ db_updated = True
921
933
 
922
934
  if not db_updated:
923
935
  # save to the local file (PVC in Kubernetes, local file otherwise)
@@ -7,6 +7,7 @@ import fastapi
7
7
  from sky import core as sky_core
8
8
  from sky.server.requests import executor
9
9
  from sky.server.requests import payloads
10
+ from sky.server.requests import request_names
10
11
  from sky.server.requests import requests as requests_lib
11
12
  from sky.ssh_node_pools import core as ssh_node_pools_core
12
13
  from sky.utils import common_utils
@@ -99,9 +100,9 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
99
100
  """Deploy SSH Node Pool using existing ssh_up functionality."""
100
101
  try:
101
102
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
102
- executor.schedule_request(
103
+ await executor.schedule_request_async(
103
104
  request_id=request.state.request_id,
104
- request_name='ssh_up',
105
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
105
106
  request_body=ssh_up_body,
106
107
  func=sky_core.ssh_up,
107
108
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -124,9 +125,9 @@ async def deploy_ssh_node_pool_general(
124
125
  ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
125
126
  """Deploys all SSH Node Pools."""
126
127
  try:
127
- executor.schedule_request(
128
+ await executor.schedule_request_async(
128
129
  request_id=request.state.request_id,
129
- request_name='ssh_up',
130
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
130
131
  request_body=ssh_up_body,
131
132
  func=sky_core.ssh_up,
132
133
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -150,9 +151,9 @@ async def down_ssh_node_pool(request: fastapi.Request,
150
151
  """Cleans up a SSH Node Pools."""
151
152
  try:
152
153
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
153
- executor.schedule_request(
154
+ await executor.schedule_request_async(
154
155
  request_id=request.state.request_id,
155
- request_name='ssh_down',
156
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
156
157
  request_body=ssh_up_body,
157
158
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
158
159
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -178,9 +179,9 @@ async def down_ssh_node_pool_general(
178
179
  try:
179
180
  # Set cleanup=True for down operation
180
181
  ssh_up_body.cleanup = True
181
- executor.schedule_request(
182
+ await executor.schedule_request_async(
182
183
  request_id=request.state.request_id,
183
- request_name='ssh_down',
184
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
184
185
  request_body=ssh_up_body,
185
186
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
186
187
  schedule_type=requests_lib.ScheduleType.LONG,