skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/logs/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Sky logging agents."""
2
+ from typing import Optional
3
+
4
+ from sky import exceptions
5
+ from sky import skypilot_config
6
+ from sky.logs.agent import LoggingAgent
7
+ from sky.logs.aws import CloudwatchLoggingAgent
8
+ from sky.logs.gcp import GCPLoggingAgent
9
+
10
+
11
def get_logging_agent() -> Optional[LoggingAgent]:
    """Build the logging agent selected by the `logs.store` config entry.

    Returns:
        None when `logs.store` is not set. Otherwise a `GCPLoggingAgent`
        for 'gcp' or a `CloudwatchLoggingAgent` for 'aws', constructed from
        the matching `logs.gcp` / `logs.aws` config section (an empty dict
        when that section is absent).

    Raises:
        exceptions.InvalidSkyPilotConfigError: If `logs.store` holds any
            value other than 'gcp' or 'aws'.
    """
    backend = skypilot_config.get_nested(('logs', 'store'), None)
    # No store configured: there is no agent to build.
    if backend is None:
        return None
    if backend == 'aws':
        aws_config = skypilot_config.get_nested(('logs', 'aws'), {})
        return CloudwatchLoggingAgent(aws_config)
    if backend == 'gcp':
        gcp_config = skypilot_config.get_nested(('logs', 'gcp'), {})
        return GCPLoggingAgent(gcp_config)
    # Any other value is treated as a configuration error.
    raise exceptions.InvalidSkyPilotConfigError(
        f'Invalid logging store: {backend}')
sky/logs/agent.py ADDED
@@ -0,0 +1,108 @@
1
+ """Base class for all logging agents."""
2
+ import abc
3
+ import os
4
+ import shlex
5
+ from typing import Any, Dict
6
+
7
+ from sky.skylet import constants
8
+ from sky.utils import resources_utils
9
+ from sky.utils import yaml_utils
10
+
11
+
12
class LoggingAgent(abc.ABC):
    """Interface implemented by every logging agent.

    A concrete agent provides two things the provisioner needs in order to
    set the agent up on each node: the shell command that performs the
    setup (`get_setup_command`) and the credential files that have to be
    mounted (`get_credential_file_mounts`).
    """

    @abc.abstractmethod
    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return the setup command of the agent for the given cluster."""

    @abc.abstractmethod
    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Return the credential file mounts required by the agent."""
29
+
30
+
31
class FluentbitAgent(LoggingAgent):
    """Base class for log stores that are fed through Fluent Bit.

    Subclasses only have to describe the store-specific output section via
    `fluentbit_output_config`; installation, configuration and start-up of
    Fluent Bit are handled here.
    """

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return the shell command that installs and (re)starts Fluent Bit.

        The command installs Fluent Bit through apt when no binary is found,
        writes the rendered config to `fluentbit.yaml` under
        `constants.LOGGING_CONFIG_DIR`, kills the process recorded in
        `/tmp/fluentbit.pid` (if any) and starts a new background process
        whose pid is written back to that file.

        Args:
            cluster_name: Cluster the config is rendered for.

        Returns:
            The steps joined by `; ` and preceded by `set -e`.
        """
        config_dir = constants.LOGGING_CONFIG_DIR
        config_file = os.path.join(config_dir, 'fluentbit.yaml')
        rendered_config = self.fluentbit_config(cluster_name)

        # Skipped entirely when a fluent-bit binary is already available.
        install_step = (
            # pylint: disable=line-too-long
            'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
            'sudo apt-get update; sudo apt-get install -y gnupg; '
            # pylint: disable=line-too-long
            'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
            # pylint: disable=line-too-long
            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
            # pylint: disable=line-too-long
            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
            # pylint: disable=line-too-long
            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
            'sudo apt-get update; '
            'sudo apt-get install -y fluent-bit; '
            'fi')
        # The YAML is shell-quoted so it survives `echo` verbatim.
        write_config_step = (
            f'mkdir -p {config_dir} && '
            f'echo {shlex.quote(rendered_config)} > {config_file}')
        # Best effort: a stale pid file must not abort the setup.
        stop_previous_step = (
            'if [ -f "/tmp/fluentbit.pid" ]; then '
            # pylint: disable=line-too-long
            'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
            'kill "$(cat /tmp/fluentbit.pid)" || true; '
            'fi')
        # Falls back to the default install location when fluent-bit is not
        # on PATH.
        launch_step = ('nohup $(command -v fluent-bit || '
                       'echo "/opt/fluent-bit/bin/fluent-bit") '
                       f'-c {config_file} > /tmp/fluentbit.log 2>&1 & '
                       'echo $! > /tmp/fluentbit.pid')

        return '; '.join([
            'set -e',
            install_step,
            write_config_step,
            stop_previous_step,
            launch_step,
        ])

    def fluentbit_config(self,
                         cluster_name: resources_utils.ClusterName) -> str:
        """Render the Fluent Bit configuration as a YAML string.

        The config tails `*/*.log` under `constants.SKY_LOGS_DIRECTORY`,
        runs every record's `log` field through the `sky-ray-parser` regex
        parser and ships the result to the output returned by
        `fluentbit_output_config`.

        Args:
            cluster_name: Cluster passed on to `fluentbit_output_config`.

        Returns:
            The YAML document produced by `yaml_utils.dump_yaml_str`.
        """
        ray_parser = {
            'name': 'sky-ray-parser',
            'format': 'regex',
            # pylint: disable=line-too-long
            'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
            'types': 'rank:integer pid:integer',
        }
        tail_input = {
            'name': 'tail',
            'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
            'path_key': 'log_path',
            # Every job creates a new log file, so the default 60s refresh
            # is too slow: the VM might be autodown within a minute right
            # after the job completes. Refresh every second instead.
            'refresh_interval': 1,
        }
        parser_filter = {
            'name': 'parser',
            'match': '*',
            'key_name': 'log',
            'parser': 'sky-ray-parser',
            'preserve_key': 'on',  # preserve field for backwards compat
            'reserve_data': 'on',
        }
        pipeline = {
            'inputs': [tail_input],
            'filters': [parser_filter],
            'outputs': [self.fluentbit_output_config(cluster_name)],
        }
        return yaml_utils.dump_yaml_str({
            'parsers': [ray_parser],
            'pipeline': pipeline,
        })

    @abc.abstractmethod
    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Return the store-specific entry for the `outputs` section."""
sky/logs/aws.py ADDED
@@ -0,0 +1,243 @@
1
+ """AWS CloudWatch logging agent."""
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import pydantic
6
+
7
+ from sky.logs.agent import FluentbitAgent
8
+ from sky.utils import resources_utils
9
+ from sky.utils import yaml_utils
10
+
11
# Base URL of the EC2 instance metadata service, written as a double-quoted
# shell expression so it is expanded on the node: the value of
# $AWS_EC2_METADATA_SERVICE_ENDPOINT when set and non-empty, otherwise
# http://169.254.169.254/.
EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
12
+
13
+
14
class _CloudwatchLoggingConfig(pydantic.BaseModel):
    """User-facing settings of the CloudWatch agent (`logs.aws` section)."""
    # Exported as AWS_REGION / AWS_DEFAULT_REGION by the setup command and
    # forwarded to the Fluent Bit output when set.
    region: Optional[str] = None
    # Path exported as AWS_SHARED_CREDENTIALS_FILE and mounted on the nodes.
    credentials_file: Optional[str] = None
    log_group_name: str = 'skypilot-logs'
    # The cluster's name on cloud is appended to this prefix.
    log_stream_prefix: str = 'skypilot-'
    auto_create_group: bool = True
    # Extra key/value pairs upserted into every log record.
    additional_tags: Optional[Dict[str, str]] = None
22
+
23
+
24
class _CloudWatchOutputConfig(pydantic.BaseModel):
    """Helper model that renders the `cloudwatch_logs` output section.

    Ref: https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
    """
    name: str = 'cloudwatch_logs'
    match: str = '*'
    region: Optional[str] = None
    log_group_name: Optional[str] = None
    log_stream_prefix: Optional[str] = None
    auto_create_group: bool = True
    additional_tags: Optional[Dict[str, str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Dump the set fields, spelling `auto_create_group` as a string.

        Returns:
            The model fields without `None` values; `auto_create_group` is
            emitted as `'true'` / `'false'` instead of a Python bool.
        """
        rendered = self.model_dump(exclude_none=True)
        if 'auto_create_group' not in rendered:
            return rendered
        if rendered['auto_create_group']:
            rendered['auto_create_group'] = 'true'
        else:
            rendered['auto_create_group'] = 'false'
        return rendered
43
+
44
+
45
class CloudwatchLoggingAgent(FluentbitAgent):
    """AWS CloudWatch logging agent.

    This agent forwards logs from SkyPilot clusters to AWS CloudWatch using
    Fluent Bit. It supports authentication via IAM roles, an AWS credentials
    file, or environment variables.

    Example configuration:
    ```yaml
    logs:
      store: aws
      aws:
        region: us-west-2
        log_group_name: skypilot-logs
        log_stream_prefix: my-cluster-
        auto_create_group: true
    ```
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize the CloudWatch logging agent.

        Args:
            config: The configuration for the CloudWatch logging agent.
                See the class docstring for the expected format.
        """
        self.config = _CloudwatchLoggingConfig(**config)
        super().__init__()

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Get the command to set up the CloudWatch logging agent.

        The returned command first validates the credentials, then handles
        the region and probes CloudWatch access, and finally runs the
        generic Fluent Bit setup of the base class.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The command to set up the CloudWatch logging agent.
        """
        credential_path = self.config.credentials_file

        # Credential check. A configured credentials file is used as-is
        # (and must look like a valid credentials file); without one, the
        # node must either expose IAM role credentials through the EC2
        # metadata service or carry the AWS_* key environment variables.
        if credential_path:
            pre_cmd = (
                f'export AWS_SHARED_CREDENTIALS_FILE={credential_path}; '
                f'if [ ! -f {credential_path} ]; then '
                f'echo "ERROR: AWS credentials file {credential_path} '
                'not found. Please check if the file exists and is '
                'accessible." && exit 1; '
                'fi; '
                f'if ! grep -q "\\[.*\\]" {credential_path} || '
                f'! grep -q "aws_access_key_id" {credential_path}; then '
                f'echo "ERROR: AWS credentials file {credential_path} is '
                'invalid. It should contain a profile section '
                '[profile_name] and aws_access_key_id." && exit 1; '
                'fi;')
        else:
            pre_cmd = (
                f'if ! curl -s -m 1 {EC2_MD_URL}'
                'latest/meta-data/iam/security-credentials/ > /dev/null; '
                'then '
                # failed EC2 check, look for env vars
                'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
                '[ -z "$AWS_SECRET_ACCESS_KEY" ]; then '
                'echo "ERROR: AWS CloudWatch logging configuration error. '
                'Not running on EC2 with IAM role and AWS credentials not '
                'found in environment. Please do one of the following: '
                '1. Run on an EC2 instance with an IAM role that has '
                'CloudWatch permissions, 2. Set AWS_ACCESS_KEY_ID and '
                'AWS_SECRET_ACCESS_KEY environment variables, or '
                '3. Provide a credentials file via logs.aws.credentials_file '
                'in SkyPilot config." && exit 1; '
                'fi; '
                'fi;')

        region = self.config.region
        if region:
            # Export the configured region and, when the AWS CLI exists,
            # persist it there too. The probe uses the portable
            # `>/dev/null 2>&1` form: `&>` is a Bash extension that a POSIX
            # sh parses as "run in background, then redirect", which would
            # make `aws configure` run even when the CLI is missing.
            pre_cmd += (f' export AWS_REGION={region}'
                        f' AWS_DEFAULT_REGION={region};'
                        ' command -v aws >/dev/null 2>&1 && '
                        f'aws configure set region {region};')
        else:
            # No region configured: only warn when the environment does not
            # provide one either.
            pre_cmd += (
                ' if [ -z "$AWS_REGION" ] && '
                '[ -z "$AWS_DEFAULT_REGION" ]; then '
                'echo "WARNING: AWS region not specified in configuration or '
                'environment. CloudWatch logging may fail if the region '
                'cannot be determined. Consider setting logs.aws.region in '
                'SkyPilot config."; '
                'fi; ')

        # Verify that the credentials actually work with CloudWatch. The
        # check is skipped when the AWS CLI is not installed.
        pre_cmd += (
            ' echo "Verifying AWS CloudWatch access..."; '
            'if command -v aws > /dev/null; then '
            'aws cloudwatch list-metrics --namespace AWS/Logs --max-items 1 '
            '> /dev/null 2>&1 || '
            '{ echo "ERROR: Failed to access AWS CloudWatch. Please check '
            'your credentials and permissions."; '
            'echo "The IAM role or user must have cloudwatch:ListMetrics '
            'and logs:* permissions."; '
            'exit 1; }; '
            'else echo "AWS CLI not installed, skipping CloudWatch access '
            'verification."; '
            'fi; ')

        return pre_cmd + ' ' + super().get_setup_command(cluster_name)

    def fluentbit_config(self,
                         cluster_name: resources_utils.ClusterName) -> str:
        """Get the Fluent Bit configuration for CloudWatch.

        This extends the base configuration by attaching `content_modifier`
        processors to the first input. They upsert the cluster tags
        (`skypilot.cluster_name`, `skypilot.cluster_id`) and any configured
        `additional_tags` into every log record.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The Fluent Bit configuration as a YAML string.
        """
        cfg_dict = yaml_utils.read_yaml_str(
            super().fluentbit_config(cluster_name))

        # Tags added to every record; user-provided tags may override the
        # built-in ones because they are applied last.
        tags = {
            'skypilot.cluster_name': cluster_name.display_name,
            'skypilot.cluster_id': cluster_name.name_on_cloud,
        }
        if self.config.additional_tags:
            tags.update(self.config.additional_tags)

        log_processors = [{
            'name': 'content_modifier',
            'action': 'upsert',
            'key': key,
            'value': value
        } for key, value in tags.items()]

        # Append to any processors that the base config already declares on
        # the input instead of replacing them.
        first_input = cfg_dict['pipeline']['inputs'][0]
        processors_config = first_input.get('processors', {})
        processors_config.setdefault('logs', []).extend(log_processors)
        first_input['processors'] = processors_config

        return yaml_utils.dump_yaml_str(cfg_dict)

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Get the Fluent Bit output configuration for CloudWatch.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The Fluent Bit output configuration for CloudWatch.
        """
        # Include the cluster's name on cloud in the log stream prefix so
        # that streams can be attributed to a cluster in CloudWatch.
        unique_name = cluster_name.name_on_cloud
        log_stream_prefix = f'{self.config.log_stream_prefix}{unique_name}-'

        return _CloudWatchOutputConfig(
            region=self.config.region,
            log_group_name=self.config.log_group_name,
            log_stream_prefix=log_stream_prefix,
            auto_create_group=self.config.auto_create_group,
        ).to_dict()

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Get the credential file mounts for the CloudWatch logging agent.

        Returns:
            A dictionary mapping local credential file paths to remote paths.
            The configured credentials file is mounted at the same path;
            the dictionary is empty when no file is configured.
        """
        if self.config.credentials_file:
            return {self.config.credentials_file: self.config.credentials_file}
        return {}
sky/logs/gcp.py ADDED
@@ -0,0 +1,91 @@
1
+ """GCP logging agent."""
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import pydantic
6
+
7
+ from sky.clouds import gcp
8
+ from sky.logs.agent import FluentbitAgent
9
+ from sky.utils import resources_utils
10
+
11
+
12
class _GCPLoggingConfig(pydantic.BaseModel):
    """User-facing settings of the GCP agent (`logs.gcp` section)."""
    # Forwarded to the stackdriver output as `export_to_project_id`.
    project_id: Optional[str] = None
    # Path exported as GOOGLE_APPLICATION_CREDENTIALS and mounted on the
    # nodes; the default GCP application credential path is used when unset.
    credentials_file: Optional[str] = None
    # Extra labels merged over the built-in cluster labels.
    additional_labels: Optional[Dict[str, str]] = None
17
+
18
+
19
class _StackdriverOutputConfig(pydantic.BaseModel):
    """Helper model that renders the `stackdriver` output section.

    Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
    """
    name: str = 'stackdriver'
    match: str = '*'
    export_to_project_id: Optional[str] = None
    labels: Optional[Dict[str, str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Dump the set fields, flattening `labels` into one string.

        Returns:
            The model fields without `None` values; non-empty `labels` are
            rendered as comma-separated `k=v` pairs instead of a mapping.
        """
        rendered = self.model_dump(exclude_none=True)
        if not self.labels:
            return rendered
        # Replace the label format from `{k: v}` to `k=v`
        pairs = (f'{key}={value}' for key, value in self.labels.items())
        rendered['labels'] = ','.join(pairs)
        return rendered
36
+
37
+
38
class GCPLoggingAgent(FluentbitAgent):
    """GCP logging agent.

    Forwards logs to GCP through the Fluent Bit `stackdriver` output.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize the GCP logging agent.

        Args:
            config: The `logs.gcp` configuration section; validated against
                `_GCPLoggingConfig`.
        """
        self.config = _GCPLoggingConfig(**config)
        super().__init__()

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Get the command to set up the GCP logging agent.

        The command exports GOOGLE_APPLICATION_CREDENTIALS, aborts with exit
        status 1 when neither a service account key nor the metadata server
        is available, and otherwise runs the generic Fluent Bit setup.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The command to set up the GCP logging agent.
        """
        credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
        if self.config.credentials_file:
            credential_path = self.config.credentials_file
        # Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
        # is valid.
        # Stackdriver only support service account credentials or credentials
        # from metadata server (only available on GCE or GKE). If the default
        # credentials uploaded by API server is NOT a service account key and
        # there is NO metadata server available, the logging agent will fail to
        # authenticate and we require the user to upload a service account key
        # via logs.gcp.credentials_file in this case.
        # Also note that we use env var instead of YAML config to specify the
        # service account key file path in order to resolve the home directory
        # more reliably.
        # Ref: https://github.com/fluent/fluent-bit/issues/8804
        # TODO(aylei): check whether the credentials config is valid before
        # provision.
        #
        # The fallback chain uses brace groups rather than `( ... )`: an
        # `exit 1` inside a subshell only terminates the subshell, and since
        # the base setup command contains `;`-separated steps, everything
        # after its first `;` would still run after a failed check.
        pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
                   f'cat {credential_path} | grep "service_account" || '
                   f'{{ echo "Credentials file {credential_path} is not a '
                   'service account key, check metadata server" && '
                   'curl -s http://metadata.google.internal >/dev/null || '
                   '{ echo "Neither service account key nor metadata server is '
                   'available. Set logs.gcp.credentials_file to a service '
                   'account key in server config and retry."; '
                   'exit 1; }; }')
        return pre_cmd + ' && ' + super().get_setup_command(cluster_name)

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Get the Fluent Bit output configuration for stackdriver.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The stackdriver output section, labelled with the cluster's
            display name and name on cloud plus any `additional_labels`
            (which take precedence on key collisions).
        """
        display_name = cluster_name.display_name
        unique_name = cluster_name.name_on_cloud

        return _StackdriverOutputConfig(
            export_to_project_id=self.config.project_id,
            labels={
                'skypilot_cluster_name': display_name,
                'skypilot_cluster_id': unique_name,
                **(self.config.additional_labels or {})
            },
        ).to_dict()

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Get the credential file mounts for the GCP logging agent.

        Returns:
            A dictionary mapping the configured credentials file to the same
            remote path, or an empty dictionary when none is configured.
        """
        if self.config.credentials_file:
            return {self.config.credentials_file: self.config.credentials_file}
        return {}
File without changes