skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as a potentially problematic release.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/setup_files/dependencies.py CHANGED
@@ -88,17 +88,18 @@ local_ray = [
     'ray[default] >= 2.2.0, != 2.6.0',
 ]
 
+# See requirements-dev.txt for the version of grpc and protobuf
+# used to generate the code during development.
 remote = [
-    # Adopted from ray's setup.py:
-    # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L251-L252
-    # SkyPilot: != 1.48.0 is required to avoid the error where ray dashboard
-    # fails to start when ray start is called (#2054).
-    # Tracking issue: https://github.com/ray-project/ray/issues/30984
-    'grpcio >= 1.32.0, != 1.48.0; python_version < \'3.10\'',
-    'grpcio >= 1.42.0, != 1.48.0; python_version >= \'3.10\'',
-    # Adopted from ray's setup.py:
-    # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343
-    'protobuf >= 3.15.3, != 3.19.5',
+    # The grpc version at runtime has to be newer than the version
+    # used to generate the code.
+    'grpcio>=1.63.0',
+    # >= 5.26.1 because the runtime version can't be older than the version
+    # used to generate the code.
+    # < 7.0.0 because code generated for a major version V will be supported by
+    # protobuf runtimes of version V and V+1.
+    # https://protobuf.dev/support/cross-version-runtime-guarantee
+    'protobuf >= 5.26.1, < 7.0.0',
 ]
 
 # NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
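The new pins replace the ray-derived grpcio/protobuf ranges with grpcio>=1.63.0 and protobuf>=5.26.1,<7.0.0, following protobuf's cross-version runtime guarantee (code generated for major version V is supported by runtimes V and V+1). A minimal sketch, assuming the third-party packaging library is available, of how a remote environment could check that its installed versions satisfy these pins; it is illustrative and not part of the wheel:

    # Illustrative check against the new runtime pins (not part of the diff).
    from importlib import metadata

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    PINS = {
        'grpcio': SpecifierSet('>=1.63.0'),
        'protobuf': SpecifierSet('>=5.26.1,<7.0.0'),
    }

    for name, spec in PINS.items():
        installed = Version(metadata.version(name))
        status = 'ok' if installed in spec else 'needs upgrade'
        print(f'{name} {installed}: {status} (required {spec})')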
sky/skylet/autostop_lib.py CHANGED
@@ -16,8 +16,13 @@ from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import autostopv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    # To avoid requiring protobuf to be installed on the client side.
+    autostopv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -55,11 +60,9 @@ Determines the condition for resetting the idleness timer.
    This option works in conjunction with ``--{pair}``. Options:

    \b
-    1. ``jobs_and_ssh`` (default): Wait for all jobs to complete AND all SSH
-    sessions to disconnect.
-    2. ``jobs``: Wait for all jobs to complete.
-    3. ``none``: Stop immediately after idle time expires, regardless of running
-    jobs or SSH connections."""
+    1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
+    2. ``jobs``: Only wait for in-progress jobs.
+    3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
 
     @classmethod
     def from_str(cls, mode: str) -> 'AutostopWaitFor':
@@ -78,6 +81,36 @@ jobs or SSH connections."""
                 f'\'{cls.JOBS.value}\', or '
                 f'\'{cls.NONE.value}\'. ')
 
+    @classmethod
+    def from_protobuf(
+        cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
+    ) -> Optional['AutostopWaitFor']:
+        """Convert protobuf AutostopWaitFor enum to Python enum value."""
+        protobuf_to_enum = {
+            autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
+            autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
+            autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
+            autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            AutostopWaitFor.JOBS_AND_SSH:
+                autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
+            AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
+            AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
+        }
+        if self not in enum_to_protobuf:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Unknown AutostopWaitFor value: {self}')
+        return enum_to_protobuf[self]
+
 
 DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
 
sky/skylet/constants.py CHANGED
@@ -90,12 +90,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '16'
+SKYLET_VERSION = '17'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 4
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
+SKYLET_GRPC_PORT = 46590
+SKYLET_GRPC_TIMEOUT_SECONDS = 5
 
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
sky/skylet/services.py ADDED
@@ -0,0 +1,44 @@
+"""gRPC service implementations for skylet."""
+
+import grpc
+
+from sky import sky_logging
+from sky.schemas.generated import autostopv1_pb2
+from sky.schemas.generated import autostopv1_pb2_grpc
+from sky.skylet import autostop_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
+    """Implementation of the AutostopService gRPC service."""
+
+    def SetAutostop(  # type: ignore[return]
+            self, request: autostopv1_pb2.SetAutostopRequest,
+            context: grpc.ServicerContext
+    ) -> autostopv1_pb2.SetAutostopResponse:
+        """Sets autostop configuration for the cluster."""
+        try:
+            wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
+                request.wait_for)
+            autostop_lib.set_autostop(
+                idle_minutes=request.idle_minutes,
+                backend=request.backend,
+                wait_for=wait_for if wait_for is not None else
+                autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
+                down=request.down)
+            return autostopv1_pb2.SetAutostopResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def IsAutostopping(  # type: ignore[return]
+            self, request: autostopv1_pb2.IsAutostoppingRequest,
+            context: grpc.ServicerContext
+    ) -> autostopv1_pb2.IsAutostoppingResponse:
+        """Checks if the cluster is currently autostopping."""
+        try:
+            is_autostopping = autostop_lib.get_is_autostopping()
+            return autostopv1_pb2.IsAutostoppingResponse(
+                is_autostopping=is_autostopping)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
sky/skylet/skylet.py CHANGED
@@ -1,11 +1,17 @@
 """skylet: a daemon running on the head node of a cluster."""
 
+import concurrent.futures
+import os
 import time
 
+import grpc
+
 import sky
 from sky import sky_logging
+from sky.schemas.generated import autostopv1_pb2_grpc
 from sky.skylet import constants
 from sky.skylet import events
+from sky.skylet import services
 
 # Use the explicit logger name so that the logger is under the
 # `sky.skylet.skylet` namespace when executed directly, so as
@@ -31,7 +37,46 @@ EVENTS = [
     events.UsageHeartbeatReportEvent(),
 ]
 
-while True:
-    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-    for event in EVENTS:
-        event.run()
+
+def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
+    """Start the gRPC server."""
+    # This is the default value in Python 3.8 - 3.12,
+    # putting it here for visibility.
+    # TODO(kevin): Determine the optimal max number of threads.
+    max_workers = min(32, (os.cpu_count() or 1) + 4)
+    server = grpc.server(
+        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
+
+    autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
+        services.AutostopServiceImpl(), server)
+
+    listen_addr = f'127.0.0.1:{port}'
+    server.add_insecure_port(listen_addr)
+
+    server.start()
+    logger.info(f'gRPC server started on {listen_addr}')
+
+    return server
+
+
+def run_event_loop():
+    """Run the existing event loop."""
+
+    while True:
+        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+        for event in EVENTS:
+            event.run()
+
+
+def main():
+    grpc_server = start_grpc_server()
+    try:
+        run_event_loop()
+    except KeyboardInterrupt:
+        logger.info('Shutting down skylet...')
+    finally:
+        grpc_server.stop(grace=5)
+
+
+if __name__ == '__main__':
+    main()
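For orientation, the new skylet gRPC server binds only to localhost on constants.SKYLET_GRPC_PORT (46590), so callers are expected to run on the head node or tunnel to it. A minimal client sketch, assuming the generated autostopv1_pb2/autostopv1_pb2_grpc modules shipped in this wheel and the standard protoc stub name AutostopServiceStub; the request field values are illustrative, not taken from the diff:

    # Minimal sketch of calling the new skylet AutostopService from the head node.
    import grpc

    from sky.schemas.generated import autostopv1_pb2
    from sky.schemas.generated import autostopv1_pb2_grpc
    from sky.skylet import constants

    channel = grpc.insecure_channel(f'127.0.0.1:{constants.SKYLET_GRPC_PORT}')
    stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)

    # Field values here are examples only; real callers pass CLI/SDK arguments.
    stub.SetAutostop(
        autostopv1_pb2.SetAutostopRequest(
            idle_minutes=10,
            backend='cloudvmray',  # assumed backend name, for illustration
            wait_for=autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
            down=False),
        timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)

    reply = stub.IsAutostopping(
        autostopv1_pb2.IsAutostoppingRequest(),
        timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
    print(reply.is_autostopping)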
sky/task.py CHANGED
@@ -10,26 +10,25 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
 
 import colorama
 
-import sky
 from sky import clouds
+from sky import dag as dag_lib
 from sky import exceptions
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
-import sky.dag
 from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.provision import docker_utils
 from sky.serve import service_spec
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import schemas
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 
 if typing.TYPE_CHECKING:
     import yaml
-
-    from sky import resources as resources_lib
 else:
     yaml = adaptors_common.LazyImport('yaml')
 
@@ -382,26 +381,28 @@
         self.estimated_inputs_size_gigabytes: Optional[float] = None
         self.estimated_outputs_size_gigabytes: Optional[float] = None
         # Default to CPU VM
-        self.resources: Union[List[sky.Resources],
-                              Set[sky.Resources]] = {sky.Resources()}
+        self.resources: Union[List['resources_lib.Resources'],
+                              Set['resources_lib.Resources']] = {
+                                  resources_lib.Resources()
+                              }
         self._service: Optional[service_spec.SkyServiceSpec] = None
 
         # Resources that this task cannot run on.
         self.blocked_resources = blocked_resources
 
-        self.time_estimator_func: Optional[Callable[['sky.Resources'],
+        self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
                                                     int]] = None
         self.file_mounts: Optional[Dict[str, str]] = None
 
         # Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
         # is the underlying managed job dag (sky.Dag object).
-        self.managed_job_dag: Optional['sky.Dag'] = None
+        self.managed_job_dag: Optional['dag_lib.Dag'] = None
 
         # Only set when 'self' is a sky serve controller task.
         self.service_name: Optional[str] = None
 
         # Filled in by the optimizer. If None, this Task is not planned.
-        self.best_resources: Optional[sky.Resources] = None
+        self.best_resources: Optional['resources_lib.Resources'] = None
 
         # For internal use only.
         self.file_mounts_mapping: Optional[Dict[str,
@@ -418,7 +419,7 @@
         if file_mounts is not None:
             self.set_file_mounts(file_mounts)
 
-        dag = sky.dag.get_current_dag()
+        dag = dag_lib.get_current_dag()
         if dag is not None:
             dag.add(self)
 
@@ -783,7 +784,8 @@
                 '_cluster_config_overrides'] = cluster_config_override
         if volumes:
             resources_config['volumes'] = volumes
-        task.set_resources(sky.Resources.from_yaml_config(resources_config))
+        task.set_resources(
+            resources_lib.Resources.from_yaml_config(resources_config))
 
         service = config.pop('service', None)
         pool = config.pop('pool', None)
@@ -931,7 +933,8 @@
         for key, (vol_name, vol_req) in topology.items():
             if vol_req is not None:
                 if key == 'cloud':
-                    override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
+                    override_params[key] = registry.CLOUD_REGISTRY.from_str(
+                        vol_req)
                 else:
                     override_params[key] = vol_req
         self.set_resources_override(override_params)
@@ -1142,7 +1145,7 @@
         Returns:
             self: The current task, with resources set.
         """
-        if isinstance(resources, sky.Resources):
+        if isinstance(resources, resources_lib.Resources):
             resources = {resources}
         # TODO(woosuk): Check if the resources are None.
         self.resources = _with_docker_login_config(resources, self.envs,
@@ -1187,8 +1190,8 @@
         self._service = service
         return self
 
-    def set_time_estimator(self, func: Callable[['sky.Resources'],
-                                                int]) -> 'Task':
+    def set_time_estimator(
+            self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
         """Sets a func mapping resources to estimated time (secs).
 
         This is EXPERIMENTAL.
@@ -1712,7 +1715,7 @@
         return required_features
 
     def __rshift__(self, b):
-        sky.dag.get_current_dag().add_edge(self, b)
+        dag_lib.get_current_dag().add_edge(self, b)
 
     def __repr__(self):
         if isinstance(self.run, str):
sky/templates/aws-ray.yml.j2 CHANGED
@@ -50,7 +50,7 @@ provider:
   disable_launch_config_check: true
 
 auth:
-  ssh_user: ubuntu
+  ssh_user: {{ssh_user}}
   ssh_private_key: {{ssh_private_key}}
 {% if ssh_proxy_command is not none %}
   ssh_proxy_command: {{ssh_proxy_command}}
@@ -68,7 +68,7 @@ available_node_types:
       ImageId: {{image_id}}  # Deep Learning AMI (Ubuntu 18.04); see aws.py.
       # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
       BlockDeviceMappings:
-        - DeviceName: /dev/sda1
+        - DeviceName: {{root_device_name}}
           Ebs:
             VolumeSize: {{disk_size}}
             VolumeType: {{disk_tier}}
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -15,6 +15,12 @@ file_mounts:
   {{controller_file_mount_path}}: {{local_file_mount_path}}
 {%- endfor %}
 
+# NOTE(dev): This needs to be a subset of sky/templates/sky-serve-controller.yaml.j2.
+# It is because we use the --fast flag to submit jobs and no --fast flag to launch pools.
+# So when we launch a new pool, it will install the required dependencies.
+# TODO(tian): Add --fast to launch pools as well, and figure out the dependency installation.
+# Maybe in the --fast implementation, we can store the hash of setup commands that used to be
+# run and don't skip setup phase if the hash is different.
 setup: |
   {{ sky_activate_python_env }}
   # Disable the pip version check to avoid the warning message, which makes the
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -378,6 +378,7 @@ available_node_types:
         {% if volume_mounts %}
         securityContext:
           fsGroup: 1000
+          fsGroupChangePolicy: OnRootMismatch
         {% endif %}
 
         # Add node selector if GPU/TPUs are requested:
sky/utils/command_runner.py CHANGED
@@ -674,7 +674,7 @@ class SSHCommandRunner(CommandRunner):
             ssh += ['-tt']
         if port_forward is not None:
             for local, remote in port_forward:
-                logger.info(
+                logger.debug(
                     f'Forwarding local port {local} to remote port {remote}.')
                 ssh += ['-NL', f'{local}:localhost:{remote}']
         if self._docker_ssh_proxy_command is not None:
sky/utils/common_utils.py CHANGED
@@ -1,6 +1,7 @@
 """Utils shared between all of sky"""
 
 import difflib
+import enum
 import functools
 import getpass
 import hashlib
@@ -55,6 +56,25 @@ _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
 logger = sky_logging.init_logger(__name__)
 
 
+class ProcessStatus(enum.Enum):
+    """Process status."""
+
+    # The process is scheduled to run, but not started yet.
+    SCHEDULED = 'SCHEDULED'
+
+    # The process is running
+    RUNNING = 'RUNNING'
+
+    # The process is finished and succeeded
+    SUCCEEDED = 'SUCCEEDED'
+
+    # The process is interrupted
+    INTERRUPTED = 'INTERRUPTED'
+
+    # The process failed
+    FAILED = 'FAILED'
+
+
 @annotations.lru_cache(scope='request')
 def get_usage_run_id() -> str:
     """Returns a unique run id for each 'run'.
sky/utils/config_utils.py CHANGED
@@ -8,6 +8,26 @@ logger = sky_logging.init_logger(__name__)
 
 _REGION_CONFIG_CLOUDS = ['nebius', 'oci']
 
+# Kubernetes API use list to represent dictionary fields with patch strategy
+# merge and each item is indexed by the patch merge key. The following map
+# maps the field name to the patch merge key.
+# pylint: disable=line-too-long
+# Ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#podspec-v1-core
+# NOTE: field containers and imagePullSecrets are not included deliberately for
+# backward compatibility (we only support one container per pod now).
+_PATCH_MERGE_KEYS = {
+    'initContainers': 'name',
+    'ephemeralContainers': 'name',
+    'volumes': 'name',
+    'volumeMounts': 'name',
+    'resourceClaims': 'name',
+    'env': 'name',
+    'hostAliases': 'ip',
+    'topologySpreadConstraints': 'topologyKey',
+    'ports': 'containerPort',
+    'volumeDevices': 'devicePath',
+}
+
 
 class Config(Dict[str, Any]):
     """SkyPilot config that supports setting/getting values with nested keys."""
@@ -211,19 +231,23 @@ def merge_k8s_configs(
                merge_k8s_configs(base_config[key][0], value[0],
                                  next_allowed_override_keys,
                                  next_disallowed_override_keys)
-            elif key in ['volumes', 'volumeMounts', 'initContainers']:
-                # If the key is 'volumes', 'volumeMounts', or 'initContainers',
-                # we search for item with the same name and merge it.
+            # For list fields with patch strategy "merge", we merge the list
+            # by the patch merge key.
+            elif key in _PATCH_MERGE_KEYS:
+                patch_merge_key = _PATCH_MERGE_KEYS[key]
                 for override_item in value:
-                    override_item_name = override_item.get('name')
+                    override_item_name = override_item.get(patch_merge_key)
                     if override_item_name is not None:
                         existing_base_item = next(
                             (v for v in base_config[key]
-                             if v.get('name') == override_item_name), None)
+                             if v.get(patch_merge_key) == override_item_name),
+                            None)
                         if existing_base_item is not None:
                             merge_k8s_configs(existing_base_item, override_item)
                         else:
                             base_config[key].append(override_item)
+                    else:
+                        base_config[key].append(override_item)
             else:
                 base_config[key].extend(value)
         else:
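The patch-merge-key change above generalizes the old volumes/volumeMounts/initContainers special case to any list field that Kubernetes merges by key. A small sketch of the intended behavior for an env list (the pod-spec values are made up; scalar fields are assumed to be overridden by the deep merge):

    # Sketch: merging an 'env' list by its patch merge key ('name').
    from sky.utils import config_utils

    base = {'env': [{'name': 'FOO', 'value': 'base'},
                    {'name': 'BAR', 'value': '1'}]}
    override = {'env': [{'name': 'FOO', 'value': 'override'},
                        {'name': 'BAZ', 'value': '2'}]}

    config_utils.merge_k8s_configs(base, override)
    # Expected: FOO merged in place, BAR kept, BAZ appended, i.e.
    # base['env'] == [{'name': 'FOO', 'value': 'override'},
    #                 {'name': 'BAR', 'value': '1'},
    #                 {'name': 'BAZ', 'value': '2'}]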
sky/utils/controller_utils.py CHANGED
@@ -23,11 +23,14 @@ from sky.clouds import gcp
 from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.serve import constants as serve_constants
+from sky.serve import serve_state
 from sky.setup_files import dependencies
 from sky.skylet import constants
 from sky.skylet import log_lib
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -37,8 +40,13 @@ from sky.utils import rich_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import psutil
+
     from sky import task as task_lib
     from sky.backends import cloud_vm_ray_backend
+else:
+    from sky.adaptors import common as adaptors_common
+    psutil = adaptors_common.LazyImport('psutil')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -1161,3 +1169,81 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
     task.update_storage_mounts(updated_mount_storages)
     if msg:
         logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
+
+
+# ======================= Resources Management Functions =======================
+
+# Based on testing, assume a running job process uses 350MB memory. We use the
+# same estimation for service controller process.
+JOB_MEMORY_MB = 350
+# Monitoring process for service is 1GB. This is based on an old estimation but
+# we keep it here for now.
+# TODO(tian): Remeasure this.
+SERVE_MONITORING_MEMORY_MB = 1024
+# The ratio of service controller process to job process. We will treat each
+# service as SERVE_PROC_RATIO job processes.
+SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
+# Past 2000 simultaneous jobs, we become unstable.
+# See https://github.com/skypilot-org/skypilot/issues/4649.
+MAX_JOB_LIMIT = 2000
+# Number of ongoing launches launches allowed per CPU, for managed jobs.
+JOB_LAUNCHES_PER_CPU = 4
+# Number of ongoing launches launches allowed per CPU, for services. This is
+# also based on an old estimation, but SKyServe indeed spawn a new process
+# for each launch operation, so it should be slightly more resources demanding
+# than managed jobs.
+SERVE_LAUNCHES_PER_CPU = 2
+# The ratio of service launch to job launch. This is inverted as the parallelism
+# is determined by 1 / LAUNCHES_PER_CPU.
+SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
+
+# The _RESOURCES_LOCK should be held whenever we are checking the parallelism
+# control or updating the schedule_state of any job or service. Any code that
+# takes this lock must conclude by calling maybe_schedule_next_jobs.
+_RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
+
+
+@annotations.lru_cache(scope='global', maxsize=1)
+def get_resources_lock_path() -> str:
+    path = os.path.expanduser(_RESOURCES_LOCK)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    return path
+
+
+@annotations.lru_cache(scope='request')
+def _get_job_parallelism() -> int:
+    job_memory = JOB_MEMORY_MB * 1024 * 1024
+    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
+    return max(job_limit, 1)
+
+
+@annotations.lru_cache(scope='request')
+def _get_launch_parallelism() -> int:
+    cpus = os.cpu_count()
+    return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
+
+
+def can_provision() -> bool:
+    # We always prioritize terminating over provisioning, to save the cost on
+    # idle resources.
+    if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
+        return False
+    return can_terminate()
+
+
+def can_start_new_process() -> bool:
+    num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
+                 managed_job_state.get_num_alive_jobs())
+    return num_procs < _get_job_parallelism()
+
+
+# We limit the number of terminating replicas to the number of CPUs. This is
+# just a temporary solution to avoid overwhelming the controller. After one job
+# controller PR, we should use API server to handle resources management.
+def can_terminate() -> bool:
+    num_terminating = (
+        serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
+        # Each terminate process will take roughly the same CPUs as job launch.
+        serve_state.total_number_terminating_replicas() +
+        managed_job_state.get_num_launching_jobs())
+    return num_terminating < _get_launch_parallelism()
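To make the sizing above concrete: job parallelism is roughly total memory divided by 350 MB (capped at 2000), and launch parallelism is 4 per CPU. A worked sketch using assumed controller hardware (16 GiB memory, 8 CPUs; the numbers are only an example):

    # Worked example of the parallelism formulas above, with assumed hardware.
    JOB_MEMORY_MB = 350
    MAX_JOB_LIMIT = 2000
    JOB_LAUNCHES_PER_CPU = 4

    total_memory_bytes = 16 * 1024**3  # assume a 16 GiB controller
    cpus = 8                           # assume 8 CPUs

    job_parallelism = max(
        min(total_memory_bytes // (JOB_MEMORY_MB * 1024 * 1024), MAX_JOB_LIMIT),
        1)
    launch_parallelism = cpus * JOB_LAUNCHES_PER_CPU

    print(job_parallelism)     # 46 concurrent job/serve processes
    print(launch_parallelism)  # 32 concurrent launch operations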
sky/utils/db/db_utils.py CHANGED
@@ -32,6 +32,23 @@ if typing.TYPE_CHECKING:
 _DB_TIMEOUT_S = 60
 
 
+class UniqueConstraintViolationError(Exception):
+    """Exception raised for unique constraint violation.
+    Attributes:
+        value -- the input value that caused the error
+        message -- explanation of the error
+    """
+
+    def __init__(self, value, message='Unique constraint violation'):
+        self.value = value
+        self.message = message
+        super().__init__(self.message)
+
+    def __str__(self):
+        return (f'UniqueConstraintViolationError: {self.message} '
+                f'(Value: {self.value})')
+
+
 class SQLAlchemyDialect(enum.Enum):
     SQLITE = 'sqlite'
     POSTGRESQL = 'postgresql'
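A brief usage sketch for the new exception type; the insert_unique helper and the duplicate-name scenario are hypothetical, only UniqueConstraintViolationError comes from the diff:

    # Hypothetical caller surfacing the new exception type.
    from sky.utils.db import db_utils


    def insert_unique(name: str) -> None:
        # Pretend the underlying INSERT hit a UNIQUE constraint on `name`.
        raise db_utils.UniqueConstraintViolationError(
            value=name, message=f'user {name!r} already exists')


    try:
        insert_unique('alice')
    except db_utils.UniqueConstraintViolationError as e:
        print(e)  # UniqueConstraintViolationError: user 'alice' already exists (Value: alice)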
sky/utils/db/migration_utils.py CHANGED
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '005'
+GLOBAL_USER_STATE_VERSION = '006'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
25
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'