skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/logs/agent.py CHANGED
@@ -38,7 +38,7 @@ class FluentbitAgent(LoggingAgent):
38
38
  'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
39
39
  'sudo apt-get update; sudo apt-get install -y gnupg; '
40
40
  # pylint: disable=line-too-long
41
- 'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
41
+ 'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
42
42
  # pylint: disable=line-too-long
43
43
  'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
44
44
  # pylint: disable=line-too-long
sky/metrics/utils.py CHANGED
@@ -48,8 +48,15 @@ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
48
48
  'sky_apiserver_code_duration_seconds',
49
49
  'Time spent processing code',
50
50
  ['name', 'group'],
51
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
52
- 60.0, 120.0, float('inf')),
51
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
52
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
53
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
54
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
55
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
56
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
57
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
58
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
59
+ 960.0, 980.0, 1000.0, float('inf')),
53
60
  )
54
61
 
55
62
  # Total number of API server requests, grouped by path, method, and status.
@@ -65,16 +72,30 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
65
72
  'sky_apiserver_request_duration_seconds',
66
73
  'Time spent processing API server requests',
67
74
  ['path', 'method', 'status'],
68
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
69
- 60.0, 120.0, float('inf')),
75
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
76
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
77
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
78
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
79
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
80
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
81
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
82
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
83
+ 960.0, 980.0, 1000.0, float('inf')),
70
84
  )
71
85
 
72
86
  SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
73
87
  'sky_apiserver_event_loop_lag_seconds',
74
88
  'Scheduling delay of the server event loop',
75
89
  ['pid'],
76
- buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
77
- 60.0, float('inf')),
90
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
91
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
92
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
93
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
94
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
95
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
96
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
97
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
98
+ 960.0, 980.0, 1000.0, float('inf')),
78
99
  )
79
100
 
80
101
  SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
@@ -122,6 +143,24 @@ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
122
143
  'RSS increment after requests', ['name'],
123
144
  buckets=_MEM_BUCKETS)
124
145
 
146
+ SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
147
+ 'sky_apiserver_websocket_ssh_latency_seconds',
148
+ ('Time taken for ssh message to go from client to API server and back'
149
+ 'to the client. This does not include: latency to reach the pod, '
150
+ 'overhead from sending through the k8s port-forward tunnel, or '
151
+ 'ssh server lag on the destination pod.'),
152
+ ['pid'],
153
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
154
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
155
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
156
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
157
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
158
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
159
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
160
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
161
+ 960.0, 980.0, 1000.0, float('inf')),
162
+ )
163
+
125
164
 
126
165
  @contextlib.contextmanager
127
166
  def time_it(name: str, group: str = 'default'):
sky/optimizer.py CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
1019
1019
  if res.instance_type is not None
1020
1020
  ])
1021
1021
  candidate_str = resources_utils.format_resource(
1022
- best_resources, simplify=True)
1022
+ best_resources, simplified_only=True)[0]
1023
1023
 
1024
1024
  logger.info(
1025
1025
  f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
sky/provision/__init__.py CHANGED
@@ -28,6 +28,7 @@ from sky.provision import primeintellect
28
28
  from sky.provision import runpod
29
29
  from sky.provision import scp
30
30
  from sky.provision import seeweb
31
+ from sky.provision import shadeform
31
32
  from sky.provision import ssh
32
33
  from sky.provision import vast
33
34
  from sky.provision import vsphere
@@ -79,6 +80,7 @@ def query_instances(
79
80
  cluster_name_on_cloud: str,
80
81
  provider_config: Optional[Dict[str, Any]] = None,
81
82
  non_terminated_only: bool = True,
83
+ retry_if_missing: bool = False,
82
84
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
83
85
  """Query instances.
84
86
 
@@ -87,6 +89,11 @@ def query_instances(
87
89
 
88
90
  A None status means the instance is marked as "terminated"
89
91
  or "terminating".
92
+
93
+ Args:
94
+ retry_if_missing: Whether to retry the call to the cloud api if the
95
+ cluster is not found when querying the live status on the cloud.
96
+ NOTE: This is currently only used on kubernetes.
90
97
  """
91
98
  raise NotImplementedError
92
99
 
@@ -630,9 +630,10 @@ def query_instances(
630
630
  cluster_name_on_cloud: str,
631
631
  provider_config: Optional[Dict[str, Any]] = None,
632
632
  non_terminated_only: bool = True,
633
+ retry_if_missing: bool = False,
633
634
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
634
635
  """See sky/provision/__init__.py"""
635
- del cluster_name # unused
636
+ del cluster_name, retry_if_missing # unused
636
637
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
637
638
  region = provider_config['region']
638
639
  ec2 = _default_ec2_resource(region)
@@ -957,9 +957,10 @@ def query_instances(
957
957
  cluster_name_on_cloud: str,
958
958
  provider_config: Optional[Dict[str, Any]] = None,
959
959
  non_terminated_only: bool = True,
960
+ retry_if_missing: bool = False,
960
961
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
961
962
  """See sky/provision/__init__.py"""
962
- del cluster_name # unused
963
+ del cluster_name, retry_if_missing # unused
963
964
  assert provider_config is not None, cluster_name_on_cloud
964
965
 
965
966
  subscription_id = provider_config['subscription_id']
sky/provision/common.py CHANGED
@@ -97,6 +97,8 @@ class InstanceInfo:
97
97
  external_ip: Optional[str]
98
98
  tags: Dict[str, str]
99
99
  ssh_port: int = 22
100
+ # The internal service address of the instance on Kubernetes.
101
+ internal_svc: Optional[str] = None
100
102
 
101
103
  def get_feasible_ip(self) -> str:
102
104
  """Get the most feasible IPs of the instance. This function returns
@@ -195,9 +195,10 @@ def query_instances(
195
195
  cluster_name_on_cloud: str,
196
196
  provider_config: Optional[Dict[str, Any]] = None,
197
197
  non_terminated_only: bool = True,
198
+ retry_if_missing: bool = False,
198
199
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
199
200
  """See sky/provision/__init__.py"""
200
- del cluster_name # unused
201
+ del cluster_name, retry_if_missing # unused
201
202
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
202
203
  instances = _filter_instances(cluster_name_on_cloud, None)
203
204
 
@@ -246,9 +246,10 @@ def query_instances(
246
246
  cluster_name_on_cloud: str,
247
247
  provider_config: Optional[Dict[str, Any]] = None,
248
248
  non_terminated_only: bool = True,
249
+ retry_if_missing: bool = False,
249
250
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
250
251
  """See sky/provision/__init__.py"""
251
- del cluster_name # unused
252
+ del cluster_name, retry_if_missing # unused
252
253
  # terminated instances are not retrieved by the
253
254
  # API making `non_terminated_only` argument moot.
254
255
  del non_terminated_only
@@ -3,11 +3,11 @@ import os
3
3
  import time
4
4
  from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
- from sky import authentication as auth
7
6
  from sky import exceptions
8
7
  from sky import sky_logging
9
8
  from sky.provision import common
10
9
  from sky.provision.fluidstack import fluidstack_utils as utils
10
+ from sky.utils import auth_utils
11
11
  from sky.utils import command_runner
12
12
  from sky.utils import common_utils
13
13
  from sky.utils import status_lib
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
27
27
  def get_internal_ip(node_info: Dict[str, Any]) -> None:
28
28
  node_info['internal_ip'] = node_info['ip_address']
29
29
 
30
- private_key_path, _ = auth.get_or_generate_keys()
30
+ private_key_path, _ = auth_utils.get_or_generate_keys()
31
31
  runner = command_runner.SSHCommandRunner(
32
32
  (node_info['ip_address'], 22),
33
33
  ssh_user='ubuntu',
@@ -291,9 +291,10 @@ def query_instances(
291
291
  cluster_name_on_cloud: str,
292
292
  provider_config: Optional[Dict[str, Any]] = None,
293
293
  non_terminated_only: bool = True,
294
+ retry_if_missing: bool = False,
294
295
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
295
296
  """See sky/provision/__init__.py"""
296
- del cluster_name # unused
297
+ del cluster_name, retry_if_missing # unused
297
298
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
298
299
  instances = _filter_instances(cluster_name_on_cloud, None)
299
300
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -62,9 +62,10 @@ def query_instances(
62
62
  cluster_name_on_cloud: str,
63
63
  provider_config: Optional[Dict[str, Any]] = None,
64
64
  non_terminated_only: bool = True,
65
+ retry_if_missing: bool = False,
65
66
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
66
67
  """See sky/provision/__init__.py"""
67
- del cluster_name # unused
68
+ del cluster_name, retry_if_missing # unused
68
69
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
69
70
  zone = provider_config['availability_zone']
70
71
  project_id = provider_config['project_id']
@@ -309,9 +309,10 @@ def query_instances(
309
309
  cluster_name_on_cloud: str,
310
310
  provider_config: Optional[dict] = None,
311
311
  non_terminated_only: bool = True,
312
+ retry_if_missing: bool = False,
312
313
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
313
314
  """Returns the status of the specified instances for Hyperbolic."""
314
- del cluster_name, provider_config # unused
315
+ del cluster_name, provider_config, retry_if_missing # unused
315
316
  # Fetch all instances for this cluster
316
317
  instances = utils.list_instances(
317
318
  metadata={'skypilot': {
@@ -434,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
434
434
  # use the external IP of the head node.
435
435
  use_external_ip = cluster_info.custom_ray_options.pop(
436
436
  'use_external_ip', False)
437
- head_ip = (head_instance.internal_ip
438
- if not use_external_ip else head_instance.external_ip)
437
+
438
+ if use_external_ip:
439
+ head_ip = head_instance.external_ip
440
+ else:
441
+ # For Kubernetes, use the internal service address of the head node.
442
+ # Keep this consistent with the logic in kubernetes-ray.yml.j2
443
+ if head_instance.internal_svc:
444
+ head_ip = head_instance.internal_svc
445
+ else:
446
+ head_ip = head_instance.internal_ip
439
447
 
440
448
  ray_cmd = ray_worker_start_command(custom_resource,
441
449
  cluster_info.custom_ray_options,
@@ -18,7 +18,6 @@ SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
18
18
 
19
19
  # Labels for the Pods created by SkyPilot
20
20
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
21
- TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
22
21
  TAG_POD_INITIALIZED = 'skypilot-initialized'
23
22
  TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
24
23