skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/services.py CHANGED
@@ -1,14 +1,36 @@
1
1
  """gRPC service implementations for skylet."""
2
2
 
3
+ import os
4
+ from typing import List, Optional
5
+
3
6
  import grpc
4
7
 
8
+ from sky import exceptions
5
9
  from sky import sky_logging
10
+ from sky.jobs import state as managed_job_state
11
+ from sky.jobs import utils as managed_job_utils
6
12
  from sky.schemas.generated import autostopv1_pb2
7
13
  from sky.schemas.generated import autostopv1_pb2_grpc
14
+ from sky.schemas.generated import jobsv1_pb2
15
+ from sky.schemas.generated import jobsv1_pb2_grpc
16
+ from sky.schemas.generated import managed_jobsv1_pb2
17
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
18
+ from sky.schemas.generated import servev1_pb2
19
+ from sky.schemas.generated import servev1_pb2_grpc
20
+ from sky.serve import serve_rpc_utils
21
+ from sky.serve import serve_state
22
+ from sky.serve import serve_utils
8
23
  from sky.skylet import autostop_lib
24
+ from sky.skylet import constants
25
+ from sky.skylet import job_lib
26
+ from sky.skylet import log_lib
9
27
 
10
28
  logger = sky_logging.init_logger(__name__)
11
29
 
30
+ # In the worst case, flush the log buffer every 50ms,
31
+ # to ensure responsiveness.
32
+ DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
33
+
12
34
 
13
35
  class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
14
36
  """Implementation of the AutostopService gRPC service."""
@@ -42,3 +64,505 @@ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
42
64
  is_autostopping=is_autostopping)
43
65
  except Exception as e: # pylint: disable=broad-except
44
66
  context.abort(grpc.StatusCode.INTERNAL, str(e))
67
+
68
+
69
class ServeServiceImpl(servev1_pb2_grpc.ServeServiceServicer):
    """Implementation of the ServeService gRPC service.

    NOTE (kyuds): this gRPC service runs cluster-side, which guarantees
    SERVE_VERSION is above 5, so some SERVE_VERSION checks present in the
    original codegen have been removed.
    """

    def GetServiceStatus(  # type: ignore[return]
            self, request: servev1_pb2.GetServiceStatusRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.GetServiceStatusResponse:
        """Gets serve status."""
        try:
            names, pool = (
                serve_rpc_utils.GetServiceStatusRequestConverter.from_proto(
                    request))
            pickled_statuses = serve_utils.get_service_status_pickled(
                names, pool)
            converter = serve_rpc_utils.GetServiceStatusResponseConverter
            return converter.to_proto(pickled_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def AddVersion(  # type: ignore[return]
            self, request: servev1_pb2.AddVersionRequest,
            context: grpc.ServicerContext) -> servev1_pb2.AddVersionResponse:
        """Adds serve version"""
        try:
            new_version = serve_state.add_version(request.service_name)
            return servev1_pb2.AddVersionResponse(version=new_version)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateServices(  # type: ignore[return]
            self, request: servev1_pb2.TerminateServicesRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateServicesResponse:
        """Terminates serve"""
        try:
            names, purge, pool = (
                serve_rpc_utils.TerminateServicesRequestConverter.from_proto(
                    request))
            result = serve_utils.terminate_services(names, purge, pool)
            return servev1_pb2.TerminateServicesResponse(message=result)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateReplica(  # type: ignore[return]
            self, request: servev1_pb2.TerminateReplicaRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateReplicaResponse:
        """Terminate replica"""
        try:
            result = serve_utils.terminate_replica(request.service_name,
                                                   request.replica_id,
                                                   request.purge)
            return servev1_pb2.TerminateReplicaResponse(message=result)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def WaitServiceRegistration(  # type: ignore[return]
            self, request: servev1_pb2.WaitServiceRegistrationRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.WaitServiceRegistrationResponse:
        """Wait for service to be registered"""
        try:
            encoded = serve_utils.wait_service_registration(
                request.service_name, request.job_id, request.pool)
            # Decode the load balancer port from the registration result.
            port = serve_utils.load_service_initialization_result(encoded)
            return servev1_pb2.WaitServiceRegistrationResponse(lb_port=port)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateService(  # type: ignore[return]
            self, request: servev1_pb2.UpdateServiceRequest,
            context: grpc.ServicerContext) -> servev1_pb2.UpdateServiceResponse:
        """Update service"""
        try:
            serve_utils.update_service_encoded(request.service_name,
                                               request.version, request.mode,
                                               request.pool)
            return servev1_pb2.UpdateServiceResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
161
+
162
+
163
+ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
164
+ """Implementation of the JobsService gRPC service."""
165
+
166
+ def AddJob( # type: ignore[return]
167
+ self, request: jobsv1_pb2.AddJobRequest,
168
+ context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
169
+ try:
170
+ job_name = request.job_name if request.HasField('job_name') else '-'
171
+ job_id, log_dir = job_lib.add_job(job_name, request.username,
172
+ request.run_timestamp,
173
+ request.resources_str,
174
+ request.metadata)
175
+ return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
176
+ except Exception as e: # pylint: disable=broad-except
177
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
178
+
179
+ def QueueJob( # type: ignore[return]
180
+ self, request: jobsv1_pb2.QueueJobRequest,
181
+ context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
182
+ try:
183
+ job_id = request.job_id
184
+ # Create log directory and file
185
+ remote_log_dir = os.path.expanduser(request.remote_log_dir)
186
+ os.makedirs(remote_log_dir, exist_ok=True)
187
+ remote_log_path = os.path.join(remote_log_dir, 'run.log')
188
+ open(remote_log_path, 'a').close() # pylint: disable=unspecified-encoding
189
+
190
+ script_path = os.path.expanduser(request.script_path)
191
+ os.makedirs(os.path.dirname(script_path), exist_ok=True)
192
+
193
+ # If `codegen` is not provided, assume script is already
194
+ # uploaded to `script_path` via rsync.
195
+ if request.HasField('codegen'):
196
+ with open(script_path, 'w', encoding='utf-8') as f:
197
+ f.write(request.codegen)
198
+ os.chmod(script_path, 0o755)
199
+
200
+ cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
201
+ job_submit_cmd = (
202
+ # JOB_CMD_IDENTIFIER is used for identifying the process
203
+ # retrieved with pid is the same driver process.
204
+ f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
205
+ f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
206
+ # Do not use &>, which is not POSIX and may not work.
207
+ # Note that the order of ">filename 2>&1" matters.
208
+ f' > {remote_log_path} 2>&1')
209
+ job_lib.scheduler.queue(job_id, job_submit_cmd)
210
+
211
+ if request.HasField('managed_job'):
212
+ managed_job = request.managed_job
213
+ pool = managed_job.pool if managed_job.HasField(
214
+ 'pool') else None
215
+ pool_hash = None
216
+ if pool is not None:
217
+ pool_hash = serve_state.get_service_hash(pool)
218
+ # Add the managed job to job queue database.
219
+ user_id = managed_job.user_id if managed_job.HasField(
220
+ 'user_id') else None
221
+ managed_job_state.set_job_info(job_id, managed_job.name,
222
+ managed_job.workspace,
223
+ managed_job.entrypoint, pool,
224
+ pool_hash, user_id)
225
+ # Set the managed job to PENDING state to make sure that
226
+ # this managed job appears in the `sky jobs queue`, even
227
+ # if it needs to wait to be submitted.
228
+ # We cannot set the managed job to PENDING state in the
229
+ # job template (jobs-controller.yaml.j2), as it may need
230
+ # to wait for the run commands to be scheduled on the job
231
+ # controller in high-load cases.
232
+ for task in managed_job.tasks:
233
+ managed_job_state.set_pending(job_id, task.task_id,
234
+ task.name, task.resources_str,
235
+ task.metadata_json)
236
+ return jobsv1_pb2.QueueJobResponse()
237
+ except Exception as e: # pylint: disable=broad-except
238
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
239
+
240
+ def UpdateStatus( # type: ignore[return]
241
+ self, request: jobsv1_pb2.UpdateStatusRequest,
242
+ context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
243
+ try:
244
+ job_lib.update_status()
245
+ return jobsv1_pb2.UpdateStatusResponse()
246
+ except Exception as e: # pylint: disable=broad-except
247
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
248
+
249
+ def GetJobQueue( # type: ignore[return]
250
+ self, request: jobsv1_pb2.GetJobQueueRequest,
251
+ context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
252
+ try:
253
+ user_hash = request.user_hash if request.HasField(
254
+ 'user_hash') else None
255
+ all_jobs = request.all_jobs
256
+ jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
257
+ all_jobs=all_jobs)
258
+ return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
259
+ except Exception as e: # pylint: disable=broad-except
260
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
261
+
262
+ def CancelJobs( # type: ignore[return]
263
+ self, request: jobsv1_pb2.CancelJobsRequest,
264
+ context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
265
+ try:
266
+ job_ids = list(request.job_ids) if request.job_ids else []
267
+ user_hash = request.user_hash if request.HasField(
268
+ 'user_hash') else None
269
+ cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
270
+ user_hash)
271
+ return jobsv1_pb2.CancelJobsResponse(
272
+ cancelled_job_ids=cancelled_job_ids)
273
+ except Exception as e: # pylint: disable=broad-except
274
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
275
+
276
+ def FailAllInProgressJobs( # type: ignore[return]
277
+ self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
278
+ context: grpc.ServicerContext
279
+ ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
280
+ try:
281
+ job_lib.fail_all_jobs_in_progress()
282
+ return jobsv1_pb2.FailAllInProgressJobsResponse()
283
+ except Exception as e: # pylint: disable=broad-except
284
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
285
+
286
+ def TailLogs(
287
+ self,
288
+ request: jobsv1_pb2.TailLogsRequest, # type: ignore[return]
289
+ context: grpc.ServicerContext):
290
+ buffer = log_lib.LogBuffer()
291
+ try:
292
+ job_id = request.job_id if request.HasField(
293
+ 'job_id') else job_lib.get_latest_job_id()
294
+ managed_job_id = request.managed_job_id if request.HasField(
295
+ 'managed_job_id') else None
296
+ log_dir = job_lib.get_log_dir_for_job(job_id)
297
+ if log_dir is None:
298
+ run_timestamp = job_lib.get_run_timestamp(job_id)
299
+ log_dir = None if run_timestamp is None else os.path.join(
300
+ constants.SKY_LOGS_DIRECTORY, run_timestamp)
301
+
302
+ for line in log_lib.buffered_iter_with_timeout(
303
+ buffer,
304
+ log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
305
+ request.follow, request.tail),
306
+ DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
307
+ yield jobsv1_pb2.TailLogsResponse(log_line=line)
308
+
309
+ job_status = job_lib.get_status(job_id)
310
+ exit_code = exceptions.JobExitCode.from_job_status(job_status)
311
+ # Fix for dashboard: When follow=False and job is still running
312
+ # (NOT_FINISHED=101), exit with success (0) since fetching current
313
+ # logs is a successful operation.
314
+ # This prevents shell wrappers from printing "command terminated
315
+ # with exit code 101".
316
+ exit_code_int = 0 if not request.follow and int(
317
+ exit_code) == 101 else int(exit_code)
318
+ yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
319
+ except Exception as e: # pylint: disable=broad-except
320
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
321
+ finally:
322
+ buffer.close()
323
+
324
+ def GetJobStatus( # type: ignore[return]
325
+ self, request: jobsv1_pb2.GetJobStatusRequest,
326
+ context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
327
+ try:
328
+ if request.job_ids:
329
+ job_ids = list(request.job_ids)
330
+ else:
331
+ latest_job_id = job_lib.get_latest_job_id()
332
+ job_ids = [latest_job_id] if latest_job_id is not None else []
333
+ job_statuses = job_lib.get_statuses(job_ids)
334
+ for job_id, status in job_statuses.items():
335
+ job_statuses[job_id] = job_lib.JobStatus(status).to_protobuf(
336
+ ) if status is not None else jobsv1_pb2.JOB_STATUS_UNSPECIFIED
337
+ return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
338
+ except Exception as e: # pylint: disable=broad-except
339
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
340
+
341
+ def GetJobSubmittedTimestamp( # type: ignore[return]
342
+ self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
343
+ context: grpc.ServicerContext
344
+ ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
345
+ try:
346
+ job_id = request.job_id if request.HasField(
347
+ 'job_id') else job_lib.get_latest_job_id()
348
+ timestamp = job_lib.get_job_submitted_or_ended_timestamp(
349
+ job_id, False)
350
+ if timestamp is None:
351
+ context.abort(grpc.StatusCode.NOT_FOUND,
352
+ f'Job {job_id} not found')
353
+ return jobsv1_pb2.GetJobSubmittedTimestampResponse(
354
+ timestamp=timestamp)
355
+ except Exception as e: # pylint: disable=broad-except
356
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
357
+
358
+ def GetJobEndedTimestamp( # type: ignore[return]
359
+ self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
360
+ context: grpc.ServicerContext
361
+ ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
362
+ try:
363
+ job_id = request.job_id if request.HasField(
364
+ 'job_id') else job_lib.get_latest_job_id()
365
+ timestamp = job_lib.get_job_submitted_or_ended_timestamp(
366
+ job_id, True)
367
+ if timestamp is None:
368
+ context.abort(grpc.StatusCode.NOT_FOUND,
369
+ f'Job {job_id} not found or not ended')
370
+ return jobsv1_pb2.GetJobEndedTimestampResponse(timestamp=timestamp)
371
+ except Exception as e: # pylint: disable=broad-except
372
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
373
+
374
+ def GetLogDirsForJobs( # type: ignore[return]
375
+ self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
376
+ context: grpc.ServicerContext
377
+ ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
378
+ try:
379
+ if request.job_ids:
380
+ job_ids = list(request.job_ids)
381
+ else:
382
+ latest_job_id = job_lib.get_latest_job_id()
383
+ job_ids = [latest_job_id] if latest_job_id is not None else []
384
+ job_log_dirs = job_lib.get_job_log_dirs(job_ids)
385
+ return jobsv1_pb2.GetLogDirsForJobsResponse(
386
+ job_log_dirs=job_log_dirs)
387
+ except Exception as e: # pylint: disable=broad-except
388
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
389
+
390
+
391
+ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
392
+ ):
393
+ """Implementation of the ManagedJobsService gRPC service."""
394
+
395
+ def GetVersion( # type: ignore[return]
396
+ self, request: managed_jobsv1_pb2.GetVersionRequest,
397
+ context: grpc.ServicerContext
398
+ ) -> managed_jobsv1_pb2.GetVersionResponse:
399
+ try:
400
+ return managed_jobsv1_pb2.GetVersionResponse(
401
+ controller_version=constants.SKYLET_VERSION)
402
+ except Exception as e: # pylint: disable=broad-except
403
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
404
+
405
+ def GetJobTable( # type: ignore[return]
406
+ self, request: managed_jobsv1_pb2.GetJobTableRequest,
407
+ context: grpc.ServicerContext
408
+ ) -> managed_jobsv1_pb2.GetJobTableResponse:
409
+ try:
410
+ accessible_workspaces = (
411
+ list(request.accessible_workspaces.workspaces)
412
+ if request.HasField('accessible_workspaces') else None)
413
+ job_ids = (list(request.job_ids.ids)
414
+ if request.HasField('job_ids') else None)
415
+ user_hashes: Optional[List[Optional[str]]] = None
416
+ if request.HasField('user_hashes'):
417
+ user_hashes = list(request.user_hashes.hashes)
418
+ # For backwards compatibility, we show jobs that do not have a
419
+ # user_hash. TODO: Remove before 0.12.0.
420
+ if request.show_jobs_without_user_hash:
421
+ user_hashes.append(None)
422
+ statuses = (list(request.statuses.statuses)
423
+ if request.HasField('statuses') else None)
424
+ fields = (list(request.fields.fields)
425
+ if request.HasField('fields') else None)
426
+ job_queue = managed_job_utils.get_managed_job_queue(
427
+ skip_finished=request.skip_finished,
428
+ accessible_workspaces=accessible_workspaces,
429
+ job_ids=job_ids,
430
+ workspace_match=request.workspace_match
431
+ if request.HasField('workspace_match') else None,
432
+ name_match=request.name_match
433
+ if request.HasField('name_match') else None,
434
+ pool_match=request.pool_match
435
+ if request.HasField('pool_match') else None,
436
+ page=request.page if request.HasField('page') else None,
437
+ limit=request.limit if request.HasField('limit') else None,
438
+ user_hashes=user_hashes,
439
+ statuses=statuses,
440
+ fields=fields,
441
+ )
442
+ jobs = job_queue['jobs']
443
+ total = job_queue['total']
444
+ total_no_filter = job_queue['total_no_filter']
445
+ status_counts = job_queue['status_counts']
446
+
447
+ jobs_info = []
448
+ for job in jobs:
449
+ converted_metadata = None
450
+ metadata = job.get('metadata')
451
+ if metadata:
452
+ converted_metadata = {
453
+ k: v for k, v in metadata.items() if v is not None
454
+ }
455
+ schedule_state = job.get('schedule_state')
456
+ if schedule_state is not None:
457
+ schedule_state = managed_job_state.ManagedJobScheduleState(
458
+ schedule_state).to_protobuf()
459
+ job_info = managed_jobsv1_pb2.ManagedJobInfo(
460
+ # The `spot.job_id`, which can be used to identify
461
+ # different tasks for the same job
462
+ _job_id=job.get('_job_id'),
463
+ job_id=job.get('job_id'),
464
+ task_id=job.get('task_id'),
465
+ job_name=job.get('job_name'),
466
+ task_name=job.get('task_name'),
467
+ job_duration=job.get('job_duration'),
468
+ workspace=job.get('workspace'),
469
+ status=managed_job_state.ManagedJobStatus(
470
+ job.get('status')).to_protobuf(),
471
+ schedule_state=schedule_state,
472
+ resources=job.get('resources'),
473
+ cluster_resources=job.get('cluster_resources'),
474
+ cluster_resources_full=job.get('cluster_resources_full'),
475
+ cloud=job.get('cloud'),
476
+ region=job.get('region'),
477
+ infra=job.get('infra'),
478
+ accelerators=job.get('accelerators'),
479
+ recovery_count=job.get('recovery_count'),
480
+ details=job.get('details'),
481
+ failure_reason=job.get('failure_reason'),
482
+ user_name=job.get('user_name'),
483
+ user_hash=job.get('user_hash'),
484
+ submitted_at=job.get('submitted_at'),
485
+ start_at=job.get('start_at'),
486
+ end_at=job.get('end_at'),
487
+ user_yaml=job.get('user_yaml'),
488
+ entrypoint=job.get('entrypoint'),
489
+ metadata=converted_metadata,
490
+ pool=job.get('pool'),
491
+ pool_hash=job.get('pool_hash'))
492
+ jobs_info.append(job_info)
493
+
494
+ return managed_jobsv1_pb2.GetJobTableResponse(
495
+ jobs=jobs_info,
496
+ total=total,
497
+ total_no_filter=total_no_filter,
498
+ status_counts=status_counts)
499
+ except Exception as e: # pylint: disable=broad-except
500
+ logger.error(e, exc_info=True)
501
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
502
+
503
+ def GetAllJobIdsByName( # type: ignore[return]
504
+ self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
505
+ context: grpc.ServicerContext
506
+ ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
507
+ try:
508
+ job_name = request.job_name if request.HasField(
509
+ 'job_name') else None
510
+ job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
511
+ return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
512
+ job_ids=job_ids)
513
+ except Exception as e: # pylint: disable=broad-except
514
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
515
+
516
+ def CancelJobs( # type: ignore[return]
517
+ self, request: managed_jobsv1_pb2.CancelJobsRequest,
518
+ context: grpc.ServicerContext
519
+ ) -> managed_jobsv1_pb2.CancelJobsResponse:
520
+ try:
521
+ cancellation_criteria = request.WhichOneof('cancellation_criteria')
522
+ if cancellation_criteria is None:
523
+ context.abort(
524
+ grpc.StatusCode.INVALID_ARGUMENT,
525
+ 'exactly one cancellation criteria must be specified.')
526
+
527
+ if cancellation_criteria == 'all_users':
528
+ user_hash = request.user_hash if request.HasField(
529
+ 'user_hash') else None
530
+ all_users = request.all_users
531
+ if not all_users and user_hash is None:
532
+ context.abort(
533
+ grpc.StatusCode.INVALID_ARGUMENT,
534
+ 'user_hash is required when all_users is False')
535
+ message = managed_job_utils.cancel_jobs_by_id(
536
+ job_ids=None,
537
+ all_users=all_users,
538
+ current_workspace=request.current_workspace,
539
+ user_hash=user_hash)
540
+ elif cancellation_criteria == 'job_ids':
541
+ job_ids = list(request.job_ids.ids)
542
+ message = managed_job_utils.cancel_jobs_by_id(
543
+ job_ids=job_ids,
544
+ current_workspace=request.current_workspace)
545
+ elif cancellation_criteria == 'job_name':
546
+ message = managed_job_utils.cancel_job_by_name(
547
+ job_name=request.job_name,
548
+ current_workspace=request.current_workspace)
549
+ elif cancellation_criteria == 'pool_name':
550
+ message = managed_job_utils.cancel_jobs_by_pool(
551
+ pool_name=request.pool_name,
552
+ current_workspace=request.current_workspace)
553
+ else:
554
+ context.abort(
555
+ grpc.StatusCode.INVALID_ARGUMENT,
556
+ f'invalid cancellation criteria: {cancellation_criteria}')
557
+ return managed_jobsv1_pb2.CancelJobsResponse(message=message)
558
+ except Exception as e: # pylint: disable=broad-except
559
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
560
+
561
+ def StreamLogs(
562
+ self,
563
+ request: managed_jobsv1_pb2.
564
+ StreamLogsRequest, # type: ignore[return]
565
+ context: grpc.ServicerContext):
566
+ # TODO(kevin): implement this
567
+ context.abort(grpc.StatusCode.UNIMPLEMENTED,
568
+ 'StreamLogs is not implemented')
sky/skylet/skylet.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """skylet: a daemon running on the head node of a cluster."""
2
2
 
3
+ import argparse
3
4
  import concurrent.futures
4
5
  import os
5
6
  import time
@@ -9,6 +10,9 @@ import grpc
9
10
  import sky
10
11
  from sky import sky_logging
11
12
  from sky.schemas.generated import autostopv1_pb2_grpc
13
+ from sky.schemas.generated import jobsv1_pb2_grpc
14
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
15
+ from sky.schemas.generated import servev1_pb2_grpc
12
16
  from sky.skylet import constants
13
17
  from sky.skylet import events
14
18
  from sky.skylet import services
@@ -44,11 +48,21 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
44
48
  # putting it here for visibility.
45
49
  # TODO(kevin): Determine the optimal max number of threads.
46
50
  max_workers = min(32, (os.cpu_count() or 1) + 4)
51
+ # There's only a single skylet process per cluster, so disable
52
+ # SO_REUSEPORT to raise an error if the port is already in use.
53
+ options = (('grpc.so_reuseport', 0),)
47
54
  server = grpc.server(
48
- concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
55
+ concurrent.futures.ThreadPoolExecutor(max_workers=max_workers),
56
+ options=options)
49
57
 
50
58
  autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
51
59
  services.AutostopServiceImpl(), server)
60
+ jobsv1_pb2_grpc.add_JobsServiceServicer_to_server(
61
+ services.JobsServiceImpl(), server)
62
+ servev1_pb2_grpc.add_ServeServiceServicer_to_server(
63
+ services.ServeServiceImpl(), server)
64
+ managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
65
+ services.ManagedJobsServiceImpl(), server)
52
66
 
53
67
  listen_addr = f'127.0.0.1:{port}'
54
68
  server.add_insecure_port(listen_addr)
@@ -62,6 +76,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
62
76
  def run_event_loop():
63
77
  """Run the existing event loop."""
64
78
 
79
+ for event in EVENTS:
80
+ event.start()
81
+
65
82
  while True:
66
83
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
67
84
  for event in EVENTS:
@@ -69,7 +86,15 @@ def run_event_loop():
69
86
 
70
87
 
71
88
  def main():
72
- grpc_server = start_grpc_server()
89
+ parser = argparse.ArgumentParser(description='Start skylet daemon')
90
+ parser.add_argument('--port',
91
+ type=int,
92
+ default=constants.SKYLET_GRPC_PORT,
93
+ help=f'gRPC port to listen on (default: '
94
+ f'{constants.SKYLET_GRPC_PORT})')
95
+ args = parser.parse_args()
96
+
97
+ grpc_server = start_grpc_server(port=args.port)
73
98
  try:
74
99
  run_event_loop()
75
100
  except KeyboardInterrupt: