skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of skypilot-nightly has been flagged as potentially problematic.
Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/serve/client/impl.py ADDED
@@ -0,0 +1,188 @@
+"""Implementation of SDK for SkyServe."""
+import json
+import typing
+from typing import List, Optional, Union
+
+import click
+
+from sky.client import common as client_common
+from sky.server import common as server_common
+from sky.server.requests import payloads
+from sky.utils import admin_policy_utils
+from sky.utils import dag_utils
+
+if typing.TYPE_CHECKING:
+    import sky
+    from sky.serve import serve_utils
+
+
+def up(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    pool: bool = False,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    assert not pool, 'Command `up` is not supported for pool.'
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    dag = dag_utils.convert_entrypoint_to_dag(task)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            noun = 'pool' if pool else 'service'
+            prompt = f'Launching a new {noun} {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+    body = payloads.ServeUpBody(
+        task=dag_str,
+        service_name=service_name,
+    )
+
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/serve/up',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    return server_common.get_request_id(response)
+
+
+def update(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    mode: 'serve_utils.UpdateMode',
+    pool: bool = False,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    assert not pool, 'Command `update` is not supported for pool.'
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+    noun = 'pool' if pool else 'service'
+
+    dag = dag_utils.convert_entrypoint_to_dag(task)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            click.confirm(f'Updating {noun} {service_name!r}. Proceed?',
+                          default=True,
+                          abort=True,
+                          show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+    body = payloads.ServeUpdateBody(
+        task=dag_str,
+        service_name=service_name,
+        mode=mode,
+    )
+
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/serve/update',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    return server_common.get_request_id(response)
+
+
+def apply(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    mode: 'serve_utils.UpdateMode',
+    pool: bool = False,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    assert pool, 'Command `apply` is only supported for pool.'
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    dag = dag_utils.convert_entrypoint_to_dag(task)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            noun = 'pool' if pool else 'service'
+            prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+    body = payloads.JobsPoolApplyBody(
+        task=dag_str,
+        pool_name=service_name,
+        mode=mode,
+    )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_apply',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    return server_common.get_request_id(response)
+
+
+def down(
+    service_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+    pool: bool = False,
+) -> server_common.RequestId:
+    if pool:
+        body = payloads.JobsPoolDownBody(
+            pool_names=service_names,
+            all=all,
+            purge=purge,
+        )
+    else:
+        body = payloads.ServeDownBody(
+            service_names=service_names,
+            all=all,
+            purge=purge,
+        )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_down' if pool else '/serve/down',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    return server_common.get_request_id(response)
+
+
+def status(
+    service_names: Optional[Union[str, List[str]]],
+    pool: bool = False,
+) -> server_common.RequestId:
+    if pool:
+        body = payloads.JobsPoolStatusBody(pool_names=service_names)
+    else:
+        body = payloads.ServeStatusBody(service_names=service_names)
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_status' if pool else '/serve/status',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    return server_common.get_request_id(response)
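The new `impl` module consolidates the serve and pool code paths: each entrypoint picks a payload type and API endpoint based on the `pool` flag. A minimal sketch of that dispatch pattern, distilled from `down`/`status` above (the helper name is hypothetical, for illustration only):

    # Hypothetical helper mirroring the endpoint routing in impl.down() and
    # impl.status(): pools go through the jobs API, services keep the serve API.
    def _endpoint_for(operation: str, pool: bool) -> str:
        return f'/jobs/pool_{operation}' if pool else f'/serve/{operation}'

    assert _endpoint_for('down', pool=True) == '/jobs/pool_down'
    assert _endpoint_for('status', pool=False) == '/serve/status'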
sky/serve/client/sdk.py CHANGED
@@ -3,16 +3,13 @@ import json
 import typing
 from typing import List, Optional, Union
 
-import click
-
 from sky.client import common as client_common
+from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
 from sky.server.requests import payloads
 from sky.usage import usage_lib
-from sky.utils import admin_policy_utils
 from sky.utils import context
-from sky.utils import dag_utils
 
 if typing.TYPE_CHECKING:
     import io
@@ -49,37 +46,10 @@ def up(
             argument.
         endpoint (str): The service endpoint.
     """
-
-    # Avoid circular import.
-    from sky.client import sdk  # pylint: disable=import-outside-toplevel
-
-    dag = dag_utils.convert_entrypoint_to_dag(task)
-    with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag, at_client_side=True) as dag:
-        sdk.validate(dag)
-        request_id = sdk.optimize(dag)
-        sdk.stream_and_get(request_id)
-        if _need_confirmation:
-            prompt = f'Launching a new service {service_name!r}. Proceed?'
-            if prompt is not None:
-                click.confirm(prompt,
-                              default=True,
-                              abort=True,
-                              show_default=True)
-
-        dag = client_common.upload_mounts_to_api_server(dag)
-        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
-
-    body = payloads.ServeUpBody(
-        task=dag_str,
-        service_name=service_name,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/up',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    return server_common.get_request_id(response)
+    return impl.up(task,
+                   service_name,
+                   pool=False,
+                   _need_confirmation=_need_confirmation)
 
 
 @context.contextual
@@ -112,35 +82,11 @@ def update(
     Request Returns:
         None
     """
-    # Avoid circular import.
-    from sky.client import sdk  # pylint: disable=import-outside-toplevel
-
-    dag = dag_utils.convert_entrypoint_to_dag(task)
-    with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag, at_client_side=True) as dag:
-        sdk.validate(dag)
-        request_id = sdk.optimize(dag)
-        sdk.stream_and_get(request_id)
-        if _need_confirmation:
-            click.confirm(f'Updating service {service_name!r}. Proceed?',
-                          default=True,
-                          abort=True,
-                          show_default=True)
-
-        dag = client_common.upload_mounts_to_api_server(dag)
-        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
-    body = payloads.ServeUpdateBody(
-        task=dag_str,
-        service_name=service_name,
-        mode=mode,
-    )
-
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/update',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    return server_common.get_request_id(response)
+    return impl.update(task,
+                       service_name,
+                       mode,
+                       pool=False,
+                       _need_confirmation=_need_confirmation)
 
 
 @usage_lib.entrypoint
@@ -171,17 +117,7 @@ def down(
         ValueError: if the arguments are invalid.
         RuntimeError: if failed to terminate the service.
     """
-    body = payloads.ServeDownBody(
-        service_names=service_names,
-        all=all,
-        purge=purge,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/down',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    return server_common.get_request_id(response)
+    return impl.down(service_names, all, purge, pool=False)
 
 
 @usage_lib.entrypoint
@@ -281,13 +217,7 @@ def status(
         RuntimeError: if failed to get the service status.
         exceptions.ClusterNotUpError: if the sky serve controller is not up.
     """
-    body = payloads.ServeStatusBody(service_names=service_names,)
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/status',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    return server_common.get_request_id(response)
+    return impl.status(service_names, pool=False)
 
 
 @usage_lib.entrypoint
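After this refactor the public SDK functions are thin wrappers over `impl` and still return a request id. A hedged usage sketch, assuming a working API server session and a task YAML at a placeholder path:

    import sky
    from sky.client import sdk
    from sky.serve.client import sdk as serve_sdk

    task = sky.Task.from_yaml('service.yaml')  # placeholder path
    request_id = serve_sdk.up(task, service_name='my-service')
    # Like other SDK calls, `up` is asynchronous; per the docstring above, the
    # request resolves to the service name and endpoint.
    result = sdk.stream_and_get(request_id)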
sky/serve/constants.py CHANGED
@@ -104,8 +104,12 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
 # Changelog:
 # v1.0 - Introduce rolling update.
 # v2.0 - Added template-replica feature.
-SERVE_VERSION = 2
+# v3.0 - Added cluster pool.
+SERVE_VERSION = 3
 
 TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
    'The version of service is outdated and does not support manually '
    'terminating replicas. Please terminate the service and spin up again.')
+
+# Dummy run command for cluster pool.
+POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
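The dummy command gives every pool replica a deterministic first job whose success marks the end of setup. A sketch of the resulting readiness check, mirroring `ReplicaInfo.probe_pool` in the replica_managers.py diff below (`backend` and `handle` are assumed to be a live backend and cluster handle):

    from sky.skylet import job_lib

    # Job 1 on a pool replica runs POOL_DUMMY_RUN_COMMAND; the replica is
    # treated as ready once that job has SUCCEEDED.
    statuses = backend.get_job_status(handle, [1], stream_logs=False)
    is_ready = statuses[1] == job_lib.JobStatus.SUCCEEDED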
sky/serve/controller.py CHANGED
@@ -100,6 +100,11 @@ class SkyServeController:
 
     def run(self) -> None:
 
+        @self._app.get('/autoscaler/info')
+        async def get_autoscaler_info() -> fastapi.Response:
+            return responses.JSONResponse(content=self._autoscaler.info(),
+                                          status_code=200)
+
         @self._app.post('/controller/load_balancer_sync')
         async def load_balancer_sync(
                 request: fastapi.Request) -> fastapi.Response:
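The new route exposes the autoscaler's state over the controller's HTTP API, alongside the existing load-balancer sync endpoint. A minimal sketch of querying it, assuming the controller is reachable at a placeholder `controller_url`:

    import requests

    # GET /autoscaler/info returns the autoscaler's info dict as JSON (HTTP 200).
    resp = requests.get(f'{controller_url}/autoscaler/info', timeout=5)
    resp.raise_for_status()
    print(resp.json())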
sky/serve/replica_managers.py CHANGED
@@ -1,4 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
+import collections
 import dataclasses
 import enum
 import functools
@@ -23,6 +24,7 @@ from sky import execution
 from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -34,6 +36,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
 
@@ -45,8 +48,6 @@ logger = sky_logging.init_logger(__name__)
 
 _JOB_STATUS_FETCH_INTERVAL = 30
 _PROCESS_POOL_REFRESH_INTERVAL = 20
-# TODO(tian): Maybe let user determine this threshold
-_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
@@ -180,6 +181,8 @@ def _get_resources_ports(service_task_yaml_path: str) -> str:
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
+    if task.service.pool:
+        return '-'
     assert task.service.ports is not None, task
     return task.service.ports
 
@@ -445,8 +448,8 @@ class ReplicaInfo:
             return None
         replica_port_int = int(self.replica_port)
         try:
-            endpoint_dict = core.endpoints(handle.cluster_name,
-                                           replica_port_int)
+            endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
+                                                        replica_port_int)
         except exceptions.ClusterNotUpError:
             return None
         endpoint = endpoint_dict.get(replica_port_int, None)
@@ -466,7 +469,9 @@ class ReplicaInfo:
                            f'replica {self.replica_id}.')
         return replica_status
 
-    def to_info_dict(self, with_handle: bool) -> Dict[str, Any]:
+    def to_info_dict(self,
+                     with_handle: bool,
+                     with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
             self.cluster_name)
         info_dict = {
@@ -474,18 +479,26 @@ class ReplicaInfo:
             'name': self.cluster_name,
             'status': self.status,
             'version': self.version,
-            'endpoint': self.url,
+            'endpoint': self.url if with_url else None,
             'is_spot': self.is_spot,
             'launched_at': (cluster_record['launched_at']
                             if cluster_record is not None else None),
         }
         if with_handle:
-            info_dict['handle'] = self.handle(cluster_record)
+            handle = self.handle(cluster_record)
+            info_dict['handle'] = handle
+            if handle is not None:
+                info_dict['cloud'] = repr(handle.launched_resources.cloud)
+                info_dict['region'] = handle.launched_resources.region
+                info_dict['resources_str'] = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=True))
         return info_dict
 
     def __repr__(self) -> str:
-        info_dict = self.to_info_dict(
-            with_handle=env_options.Options.SHOW_DEBUG_INFO.get())
+        show_details = env_options.Options.SHOW_DEBUG_INFO.get()
+        info_dict = self.to_info_dict(with_handle=show_details,
+                                      with_url=show_details)
         handle_str = ''
         if 'handle' in info_dict:
             handle_str = f', handle={info_dict["handle"]}'
@@ -499,6 +512,33 @@ class ReplicaInfo:
                 f'launched_at={info_dict["launched_at"]}{handle_str})')
         return info
 
+    def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
+        """Probe the replica for pool management.
+
+        This function will check the first job status of the cluster, which is a
+        dummy job that only echoes "setup done". The success of this job means
+        the setup command is done and the replica is ready to be used. Check
+        sky/serve/server/core.py::up for more details.
+
+        Returns:
+            Tuple of (self, is_ready, probe_time).
+        """
+        probe_time = time.time()
+        try:
+            handle = backend_utils.check_cluster_available(
+                self.cluster_name, operation='probing pool')
+            if handle is None:
+                return self, False, probe_time
+            backend = backend_utils.get_backend_from_handle(handle)
+            statuses = backend.get_job_status(handle, [1], stream_logs=False)
+            if statuses[1] == job_lib.JobStatus.SUCCEEDED:
+                return self, True, probe_time
+            return self, False, probe_time
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error when probing pool of {self.cluster_name}: '
+                         f'{common_utils.format_exception(e)}.')
+            return self, False, probe_time
+
     def probe(
         self,
         readiness_path: str,
@@ -588,6 +628,7 @@ class ReplicaManager:
         self._service_name: str = service_name
         self._uptime: Optional[float] = None
         self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
+        self._is_pool: bool = spec.pool
         header_keys = None
         if spec.readiness_headers is not None:
             header_keys = list(spec.readiness_headers.keys())
@@ -601,6 +642,15 @@ class ReplicaManager:
         # Oldest version among the currently provisioned and launched replicas
         self.least_recent_version: int = serve_constants.INITIAL_VERSION
 
+    def _consecutive_failure_threshold_timeout(self) -> int:
+        """The timeout for the consecutive failure threshold in seconds.
+
+        We reduce the timeout for pool to 10 seconds to make the pool more
+        responsive to the failure.
+        """
+        # TODO(tian): Maybe let user determine this threshold
+        return 10 if self._is_pool else 180
+
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         """Scale up the service by 1 replica with resources_override.
@@ -822,9 +872,8 @@ class SkyPilotReplicaManager(ReplicaManager):
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'replica_jobs')
-        job_log_file_name = (
-            controller_utils.download_and_stream_latest_job_log(
-                backend, handle, replica_job_logs_dir))
+        job_log_file_name = (controller_utils.download_and_stream_job_log(
+            backend, handle, replica_job_logs_dir))
         if job_log_file_name is not None:
             logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
             with open(log_file_name, 'a',
@@ -937,6 +986,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._service_name, replica_id)
             assert info is not None, replica_id
             error_in_sky_launch = False
+            schedule_next_jobs = False
            if info.status == serve_state.ReplicaStatus.PENDING:
                 # sky.launch not started yet
                 if (serve_state.total_number_provisioning_replicas() <
@@ -965,6 +1015,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 else:
                     info.status_property.sky_launch_status = (
                         ProcessStatus.SUCCEEDED)
+                    schedule_next_jobs = True
                 if self._spot_placer is not None and info.is_spot:
                     # TODO(tian): Currently, we set the location to
                     # preemptive if the launch process failed. This is
@@ -984,6 +1035,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                         self._spot_placer.set_active(location)
             serve_state.add_or_update_replica(self._service_name,
                                               replica_id, info)
+            if schedule_next_jobs and self._is_pool:
+                jobs_scheduler.maybe_schedule_next_jobs(
+                    pool=self._service_name)
             if error_in_sky_launch:
                 # Teardown after update replica info since
                 # _terminate_replica will update the replica info too.
@@ -1100,9 +1154,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch latest job, which stands for user task job
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-                                                      None,
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1112,7 +1167,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status = list(job_statuses.values())[0]
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
@@ -1156,18 +1212,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-                replica_to_probe.append(
-                    f'replica_{info.replica_id}(url={info.url})')
-                probe_futures.append(
-                    pool.apply_async(
-                        info.probe,
-                        (
-                            self._get_readiness_path(info.version),
-                            self._get_post_data(info.version),
-                            self._get_readiness_timeout_seconds(info.version),
-                            self._get_readiness_headers(info.version),
-                        ),
-                    ),)
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
 
         # Since futures.as_completed will return futures in the order of
@@ -1204,8 +1266,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                     consecutive_failure_time = (
                         info.consecutive_failure_times[-1] -
                         info.consecutive_failure_times[0])
-                    if (consecutive_failure_time >=
-                            _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT):
+                    failure_threshold = (
+                        self._consecutive_failure_threshold_timeout())
+                    if consecutive_failure_time >= failure_threshold:
                         logger.info(
                             f'Replica {info.replica_id} is not ready for '
                             'too long and exceeding consecutive failure '
@@ -1216,8 +1279,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                             f'Replica {info.replica_id} is not ready '
                             'but within consecutive failure threshold '
                             f'({consecutive_failure_time}s / '
-                            f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). '
-                            'Skipping.')
+                            f'{failure_threshold}s). Skipping.')
                     else:
                         initial_delay_seconds = self._get_initial_delay_seconds(
                             info.version)
@@ -1310,8 +1372,10 @@ class SkyPilotReplicaManager(ReplicaManager):
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
@@ -1321,17 +1385,24 @@ class SkyPilotReplicaManager(ReplicaManager):
                     self._service_name, info.version))
                 old_config = common_utils.read_yaml(
                     os.path.expanduser(old_service_task_yaml_path))
-                for key in ['service']:
-                    old_config.pop(key)
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
                 # the same.
                 # Here, we manually convert the any_of field to a set to avoid
                 # only the difference in the random order of the any_of fields.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-                new_config_any_of = new_config.get('resources',
-                                                   {}).pop('any_of', [])
-                if set(old_config_any_of) != set(new_config_any_of):
+
+                def normalize_dict_list(lst):
+                    return collections.Counter(
+                        frozenset(d.items()) for d in lst)
+
+                if (normalize_dict_list(old_config_any_of) !=
+                        normalize_dict_list(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
@@ -1345,6 +1416,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 info.version = version
                 serve_state.add_or_update_replica(self._service_name,
                                                   info.replica_id, info)
+            else:
+                logger.info('Replica config changed (rest), skipping. '
+                            f'old: {old_config}, '
+                            f'new: {new_config}')
 
     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)
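The switch from `set()` to a Counter of frozensets in the `any_of` comparison matters because `any_of` entries are dicts, which are unhashable and cannot go into a set; hashing each dict's items as a frozenset gives an order-insensitive but duplicate-aware comparison. A self-contained sketch of the behavior:

    import collections

    def normalize_dict_list(lst):
        # Each dict becomes a hashable frozenset of its items; Counter keeps
        # multiplicity, so only ordering differences are ignored.
        return collections.Counter(frozenset(d.items()) for d in lst)

    a = [{'cloud': 'aws'}, {'cloud': 'gcp'}]
    b = [{'cloud': 'gcp'}, {'cloud': 'aws'}]
    assert normalize_dict_list(a) == normalize_dict_list(b)
    assert normalize_dict_list(a) != normalize_dict_list(a + a)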