skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic.

Files changed (160)
  1. sky/__init__.py +4 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +50 -24
  4. sky/backends/cloud_vm_ray_backend.py +41 -38
  5. sky/catalog/__init__.py +3 -1
  6. sky/catalog/aws_catalog.py +8 -5
  7. sky/catalog/azure_catalog.py +8 -5
  8. sky/catalog/common.py +8 -2
  9. sky/catalog/cudo_catalog.py +5 -2
  10. sky/catalog/do_catalog.py +4 -1
  11. sky/catalog/fluidstack_catalog.py +5 -2
  12. sky/catalog/gcp_catalog.py +8 -5
  13. sky/catalog/hyperbolic_catalog.py +5 -2
  14. sky/catalog/ibm_catalog.py +8 -5
  15. sky/catalog/lambda_catalog.py +8 -5
  16. sky/catalog/nebius_catalog.py +8 -5
  17. sky/catalog/oci_catalog.py +8 -5
  18. sky/catalog/paperspace_catalog.py +4 -1
  19. sky/catalog/runpod_catalog.py +5 -2
  20. sky/catalog/scp_catalog.py +8 -5
  21. sky/catalog/vast_catalog.py +5 -2
  22. sky/catalog/vsphere_catalog.py +4 -1
  23. sky/client/cli/command.py +63 -25
  24. sky/client/sdk.py +61 -11
  25. sky/clouds/aws.py +12 -7
  26. sky/clouds/azure.py +12 -7
  27. sky/clouds/cloud.py +9 -8
  28. sky/clouds/cudo.py +13 -7
  29. sky/clouds/do.py +12 -7
  30. sky/clouds/fluidstack.py +11 -6
  31. sky/clouds/gcp.py +12 -7
  32. sky/clouds/hyperbolic.py +11 -6
  33. sky/clouds/ibm.py +11 -6
  34. sky/clouds/kubernetes.py +7 -3
  35. sky/clouds/lambda_cloud.py +11 -6
  36. sky/clouds/nebius.py +14 -12
  37. sky/clouds/oci.py +12 -7
  38. sky/clouds/paperspace.py +12 -7
  39. sky/clouds/runpod.py +12 -7
  40. sky/clouds/scp.py +11 -6
  41. sky/clouds/vast.py +14 -8
  42. sky/clouds/vsphere.py +11 -6
  43. sky/core.py +6 -1
  44. sky/dashboard/out/404.html +1 -1
  45. sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  48. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  52. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  56. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  57. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  59. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  61. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  62. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  64. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  66. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  67. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  68. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  71. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  73. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  74. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  75. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  76. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  77. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
  78. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  79. sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
  80. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  81. sky/dashboard/out/clusters/[cluster].html +1 -1
  82. sky/dashboard/out/clusters.html +1 -1
  83. sky/dashboard/out/config.html +1 -1
  84. sky/dashboard/out/index.html +1 -1
  85. sky/dashboard/out/infra/[context].html +1 -1
  86. sky/dashboard/out/infra.html +1 -1
  87. sky/dashboard/out/jobs/[job].html +1 -1
  88. sky/dashboard/out/jobs.html +1 -1
  89. sky/dashboard/out/users.html +1 -1
  90. sky/dashboard/out/volumes.html +1 -1
  91. sky/dashboard/out/workspace/new.html +1 -1
  92. sky/dashboard/out/workspaces/[name].html +1 -1
  93. sky/dashboard/out/workspaces.html +1 -1
  94. sky/data/mounting_utils.py +93 -32
  95. sky/global_user_state.py +12 -143
  96. sky/jobs/state.py +9 -88
  97. sky/jobs/utils.py +28 -13
  98. sky/provision/nebius/utils.py +3 -6
  99. sky/schemas/db/README +4 -0
  100. sky/schemas/db/env.py +90 -0
  101. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  102. sky/schemas/db/script.py.mako +28 -0
  103. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  104. sky/serve/client/sdk.py +6 -2
  105. sky/serve/controller.py +7 -3
  106. sky/serve/serve_state.py +1 -1
  107. sky/serve/serve_utils.py +171 -75
  108. sky/serve/server/core.py +17 -6
  109. sky/server/common.py +4 -3
  110. sky/server/requests/payloads.py +2 -0
  111. sky/server/requests/requests.py +1 -1
  112. sky/setup_files/MANIFEST.in +2 -0
  113. sky/setup_files/alembic.ini +148 -0
  114. sky/setup_files/dependencies.py +1 -0
  115. sky/skylet/configs.py +1 -1
  116. sky/skylet/constants.py +4 -0
  117. sky/skylet/job_lib.py +1 -1
  118. sky/skypilot_config.py +1 -1
  119. sky/users/permission.py +1 -1
  120. sky/utils/common_utils.py +85 -3
  121. sky/utils/config_utils.py +15 -0
  122. sky/utils/db/__init__.py +0 -0
  123. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  124. sky/utils/db/migration_utils.py +93 -0
  125. sky/utils/locks.py +319 -0
  126. sky/utils/schemas.py +38 -34
  127. sky/utils/timeline.py +41 -0
  128. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
  130. sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  132. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  135. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  137. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  139. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  140. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  142. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  143. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  146. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  147. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
  155. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  156. /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
  157. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
  158. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
  159. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
  160. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/219887b94512388c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/219887b94512388c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6b0575ea521af4f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-771a40cde532309b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js" defer=""></script><script src="/dashboard/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/FUjweqdImyeYhMYFON-Se/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"FUjweqdImyeYhMYFON-Se","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-a305898dc479711e.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js" defer=""></script><script src="/dashboard/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"mym3Ciwp-zqU7ZpOLGnrW","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/data/mounting_utils.py CHANGED
@@ -39,19 +39,32 @@ _GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '


  def get_s3_mount_install_cmd() -> str:
- """Returns a command to install S3 mount utility goofys."""
+ """Returns command for basic S3 mounting (goofys by default, rclone for
+ ARM64)."""
  # TODO(aylei): maintain our goofys fork under skypilot-org
- install_cmd = ('ARCH=$(uname -m) && '
- 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
- ' echo "goofys is not supported on $ARCH" && '
- f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
- 'else '
- ' ARCH_SUFFIX="amd64"; '
- 'fi && '
- 'sudo wget -nc https://github.com/aylei/goofys/'
- 'releases/download/0.24.0-aylei-upstream/goofys '
- '-O /usr/local/bin/goofys && '
- 'sudo chmod 755 /usr/local/bin/goofys')
+ install_cmd = (
+ 'ARCH=$(uname -m) && '
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
+ # Use rclone for ARM64 since goofys doesn't support it
+ # Extract core rclone installation logic without redundant ARCH check
+ ' ARCH_SUFFIX="arm" && '
+ f' (which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || '
+ f'(cd ~ > /dev/null && curl -O https://downloads.rclone.org/'
+ f'{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb '
+ f'&& sudo dpkg -i rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb '
+ f'&& rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb))) || '
+ f'(which rclone > /dev/null || (cd ~ > /dev/null && curl -O '
+ f'https://downloads.rclone.org/{RCLONE_VERSION}/'
+ f'rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm && '
+ f'sudo yum --nogpgcheck install '
+ f'rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm -y && '
+ f'rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm)); '
+ 'else '
+ ' sudo wget -nc https://github.com/aylei/goofys/'
+ 'releases/download/0.24.0-aylei-upstream/goofys '
+ '-O /usr/local/bin/goofys && '
+ 'sudo chmod 755 /usr/local/bin/goofys; '
+ 'fi')
  return install_cmd


@@ -59,15 +72,30 @@ def get_s3_mount_install_cmd() -> str:
  def get_s3_mount_cmd(bucket_name: str,
  mount_path: str,
  _bucket_sub_path: Optional[str] = None) -> str:
- """Returns a command to mount an S3 bucket using goofys."""
+ """Returns a command to mount an S3 bucket (goofys by default, rclone for
+ ARM64)"""
  if _bucket_sub_path is None:
  _bucket_sub_path = ''
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
- mount_cmd = (f'{_GOOFYS_WRAPPER} -o allow_other '
- f'--stat-cache-ttl {_STAT_CACHE_TTL} '
- f'--type-cache-ttl {_TYPE_CACHE_TTL} '
- f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ # Use rclone for ARM64 architectures since goofys doesn't support them
+ arch_check = 'ARCH=$(uname -m) && '
+ rclone_mount = (
+ f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
+ f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
+ '--daemon --allow-other')
+ goofys_mount = (f'{_GOOFYS_WRAPPER} -o allow_other '
+ f'--stat-cache-ttl {_STAT_CACHE_TTL} '
+ f'--type-cache-ttl {_TYPE_CACHE_TTL} '
+ f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ mount_cmd = (f'{arch_check}'
+ f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
+ f' {rclone_mount}; '
+ f'else '
+ f' {goofys_mount}; '
+ f'fi')
  return mount_cmd


@@ -76,17 +104,33 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
  endpoint_url: str,
  mount_path: str,
  _bucket_sub_path: Optional[str] = None) -> str:
- """Returns a command to install Nebius mount utility goofys."""
+ """Returns a command to mount Nebius bucket (goofys by default, rclone for
+ ARM64)."""
  if _bucket_sub_path is None:
  _bucket_sub_path = ''
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
- mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
- '-o allow_other '
- f'--stat-cache-ttl {_STAT_CACHE_TTL} '
- f'--type-cache-ttl {_TYPE_CACHE_TTL} '
- f'--endpoint {endpoint_url} '
- f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ # Use rclone for ARM64 architectures since goofys doesn't support them
+ arch_check = 'ARCH=$(uname -m) && '
+ rclone_mount = (
+ f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
+ f'AWS_PROFILE={nebius_profile_name} '
+ f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
+ f'--s3-endpoint {endpoint_url} --daemon --allow-other')
+ goofys_mount = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
+ '-o allow_other '
+ f'--stat-cache-ttl {_STAT_CACHE_TTL} '
+ f'--type-cache-ttl {_TYPE_CACHE_TTL} '
+ f'--endpoint {endpoint_url} '
+ f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ mount_cmd = (f'{arch_check}'
+ f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
+ f' {rclone_mount}; '
+ f'else '
+ f' {goofys_mount}; '
+ f'fi')
  return mount_cmd


@@ -236,18 +280,35 @@ def get_r2_mount_cmd(r2_credentials_path: str,
  bucket_name: str,
  mount_path: str,
  _bucket_sub_path: Optional[str] = None) -> str:
- """Returns a command to install R2 mount utility goofys."""
+ """Returns a command to mount R2 bucket (goofys by default, rclone for
+ ARM64)."""
  if _bucket_sub_path is None:
  _bucket_sub_path = ''
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
- mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
- f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
- '-o allow_other '
- f'--stat-cache-ttl {_STAT_CACHE_TTL} '
- f'--type-cache-ttl {_TYPE_CACHE_TTL} '
- f'--endpoint {endpoint_url} '
- f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ # Use rclone for ARM64 architectures since goofys doesn't support them
+ arch_check = 'ARCH=$(uname -m) && '
+ rclone_mount = (
+ f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
+ f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
+ f'AWS_PROFILE={r2_profile_name} '
+ f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
+ f'--s3-endpoint {endpoint_url} --daemon --allow-other')
+ goofys_mount = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
+ f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
+ '-o allow_other '
+ f'--stat-cache-ttl {_STAT_CACHE_TTL} '
+ f'--type-cache-ttl {_TYPE_CACHE_TTL} '
+ f'--endpoint {endpoint_url} '
+ f'{bucket_name}{_bucket_sub_path} {mount_path}')
+
+ mount_cmd = (f'{arch_check}'
+ f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
+ f' {rclone_mount}; '
+ f'else '
+ f' {goofys_mount}; '
+ f'fi')
  return mount_cmd
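
The hunks above switch sky/data/mounting_utils.py to rclone-based mounts on ARM64 hosts while keeping goofys on x86_64. A minimal usage sketch of the changed helpers follows; the bucket name and mount point are made up, and the exact shell emitted depends on module constants such as _GOOFYS_WRAPPER, RCLONE_VERSION and FUSERMOUNT3_SOFT_LINK_CMD:

# Hypothetical usage sketch of the helpers changed above.
from sky.data import mounting_utils

install_cmd = mounting_utils.get_s3_mount_install_cmd()
mount_cmd = mounting_utils.get_s3_mount_cmd('my-bucket', '/mnt/my-bucket')

# Both commands branch on `uname -m` at runtime: on aarch64/arm64 the mount
# command becomes `rclone mount :s3:my-bucket /mnt/my-bucket --daemon
# --allow-other` (after linking fusermount3); otherwise it falls back to the
# goofys wrapper with the stat/type cache TTL flags shown in the diff.
print(install_cmd)
print(mount_cmd)
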
sky/global_user_state.py CHANGED
@@ -12,7 +12,6 @@ import os
  import pathlib
  import pickle
  import re
- import threading
  import time
  import typing
  from typing import Any, Dict, List, Optional, Set, Tuple
@@ -32,9 +31,10 @@ from sky import skypilot_config
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import context_utils
- from sky.utils import db_utils
  from sky.utils import registry
  from sky.utils import status_lib
+ from sky.utils.db import db_utils
+ from sky.utils.db import migration_utils

  if typing.TYPE_CHECKING:
  from sky import backends
@@ -48,7 +48,6 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
  _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'

  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
- _DB_INIT_LOCK = threading.Lock()

  Base = declarative.declarative_base()

@@ -238,152 +237,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
  # If the database is locked, it is OK to continue, as the WAL mode
  # is not critical and is likely to be enabled by other processes.

- # Create tables if they don't exist
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
-
- # For backward compatibility.
- # TODO(zhwu): Remove this function after all users have migrated to
- # the latest version of SkyPilot.
- with orm.Session(engine) as session:
- # Add autostop column to clusters table
- db_utils.add_column_to_table_sqlalchemy(session,
- 'clusters',
- 'autostop',
- sqlalchemy.Integer(),
- default_statement='DEFAULT -1')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'metadata',
- sqlalchemy.Text(),
- default_statement='DEFAULT \'{}\'')
-
- db_utils.add_column_to_table_sqlalchemy(session,
- 'clusters',
- 'to_down',
- sqlalchemy.Integer(),
- default_statement='DEFAULT 0')
-
- # The cloud identity that created the cluster.
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'owner',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'cluster_hash',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'storage_mounts_metadata',
- sqlalchemy.LargeBinary(),
- default_statement='DEFAULT NULL')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'cluster_ever_up',
- sqlalchemy.Integer(),
- default_statement='DEFAULT 0',
- # Set the value to 1 so that all the existing clusters before #2977
- # are considered as ever up, i.e:
- # existing cluster's default (null) -> 1;
- # new cluster's default -> 0;
- # This is conservative for the existing clusters: even if some INIT
- # clusters were never really UP, setting it to 1 means they won't be
- # auto-deleted during any failover.
- value_to_replace_existing_entries=1)
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'status_updated_at',
- sqlalchemy.Integer(),
- default_statement='DEFAULT NULL')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'user_hash',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL',
- value_to_replace_existing_entries=common_utils.get_current_user(
- ).id)
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'config_hash',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'cluster_history',
- 'user_hash',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'workspace',
- sqlalchemy.Text(),
- default_statement='DEFAULT \'default\'',
- value_to_replace_existing_entries=constants.
- SKYPILOT_DEFAULT_WORKSPACE)
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'last_creation_yaml',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL',
- )
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'clusters',
- 'last_creation_command',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'users',
- 'password',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'users',
- 'created_at',
- sqlalchemy.Integer(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'cluster_history',
- 'last_creation_yaml',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'cluster_history',
- 'last_creation_command',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- session.commit()
+ # Get alembic config for state db and run migrations
+ alembic_config = migration_utils.get_alembic_config(
+ engine, migration_utils.GLOBAL_USER_STATE_DB_NAME)
+ # pylint: disable=line-too-long
+ alembic_config.config_ini_section = migration_utils.GLOBAL_USER_STATE_DB_NAME
+ migration_utils.safe_alembic_upgrade(
+ engine, alembic_config, migration_utils.GLOBAL_USER_STATE_VERSION)


  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
  global _SQLALCHEMY_ENGINE
  if _SQLALCHEMY_ENGINE is not None:
  return _SQLALCHEMY_ENGINE
- with _DB_INIT_LOCK:
+ with migration_utils.db_lock(migration_utils.GLOBAL_USER_STATE_DB_NAME):
  if _SQLALCHEMY_ENGINE is None:
  conn_string = None
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
@@ -520,6 +387,7 @@ def get_user(user_id: str) -> Optional[models.User]:
  created_at=row.created_at)


+ @_init_db
  def get_user_by_name(username: str) -> List[models.User]:
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  rows = session.query(user_table).filter_by(name=username).all()
@@ -533,6 +401,7 @@ def get_user_by_name(username: str) -> List[models.User]:
  ]


+ @_init_db
  def delete_user(user_id: str) -> None:
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  session.query(user_table).filter_by(id=user_id).delete()
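
In the hunks above, create_table now delegates schema creation and upgrades to the Alembic migrations added under sky/schemas/db/, via the new get_alembic_config and safe_alembic_upgrade helpers in sky/utils/db/migration_utils.py, and the target revision appears to be pinned per database (GLOBAL_USER_STATE_VERSION / SPOT_JOBS_VERSION) rather than always upgrading to head. As a minimal sketch of what a programmatic Alembic upgrade of this shape typically looks like (the ini path and section name below are assumptions, not SkyPilot's actual values, and SkyPilot's locking and version guards are omitted):

# Sketch only: approximates what the migration helpers presumably wrap.
# 'alembic.ini' and the default section name are hypothetical here.
import sqlalchemy
from alembic import command
from alembic.config import Config


def upgrade_db(engine: sqlalchemy.engine.Engine,
               section: str = 'global_user_state',
               revision: str = 'head') -> None:
    cfg = Config('alembic.ini', ini_section=section)
    # Point Alembic at the same database the SQLAlchemy engine uses.
    cfg.set_main_option(
        'sqlalchemy.url', engine.url.render_as_string(hide_password=False))
    # Apply all migration scripts up to the requested revision.
    command.upgrade(cfg, revision)
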
sky/jobs/state.py CHANGED
@@ -6,7 +6,6 @@ import functools
  import json
  import os
  import pathlib
- import threading
  import time
  import typing
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -24,7 +23,8 @@ from sky import sky_logging
  from sky import skypilot_config
  from sky.skylet import constants
  from sky.utils import common_utils
- from sky.utils import db_utils
+ from sky.utils.db import db_utils
+ from sky.utils.db import migration_utils

  if typing.TYPE_CHECKING:
  from sqlalchemy.engine import row
@@ -36,7 +36,6 @@ CallbackType = Callable[[str], None]
  logger = sky_logging.init_logger(__name__)

  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
- _DB_INIT_LOCK = threading.Lock()

  Base = declarative.declarative_base()

@@ -130,97 +129,19 @@ def create_table(engine: sqlalchemy.engine.Engine):
  # If the database is locked, it is OK to continue, as the WAL mode
  # is not critical and is likely to be enabled by other processes.

- # Create tables if they don't exist
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
-
- # Backward compatibility: add columns that not exist in older databases
- with orm.Session(engine) as session:
- db_utils.add_column_to_table_sqlalchemy(session, 'spot',
- 'failure_reason',
- sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(session,
- 'spot',
- 'spot_job_id',
- sqlalchemy.Integer(),
- copy_from='job_id')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'spot',
- 'task_id',
- sqlalchemy.Integer(),
- default_statement='DEFAULT 0',
- value_to_replace_existing_entries=0)
- db_utils.add_column_to_table_sqlalchemy(session,
- 'spot',
- 'task_name',
- sqlalchemy.Text(),
- copy_from='job_name')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'spot',
- 'specs',
- sqlalchemy.Text(),
- value_to_replace_existing_entries=json.dumps({
- 'max_restarts_on_errors': 0,
- }))
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'spot',
- 'local_log_file',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL')
-
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'spot',
- 'metadata',
- sqlalchemy.Text(),
- default_statement='DEFAULT \'{}\'',
- value_to_replace_existing_entries='{}')
-
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'schedule_state',
- sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'job_info',
- 'controller_pid',
- sqlalchemy.Integer(),
- default_statement='DEFAULT NULL')
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'dag_yaml_path',
- sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'env_file_path',
- sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'user_hash', sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'job_info',
- 'workspace',
- sqlalchemy.Text(),
- default_statement='DEFAULT NULL',
- value_to_replace_existing_entries='default')
- db_utils.add_column_to_table_sqlalchemy(
- session,
- 'job_info',
- 'priority',
- sqlalchemy.Integer(),
- value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'entrypoint', sqlalchemy.Text())
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
- 'original_user_yaml_path',
- sqlalchemy.Text())
- session.commit()
+ # Get alembic config for spot jobs db and run migrations
+ alembic_config = migration_utils.get_alembic_config(
+ engine, migration_utils.SPOT_JOBS_DB_NAME)
+ alembic_config.config_ini_section = migration_utils.SPOT_JOBS_DB_NAME
+ migration_utils.safe_alembic_upgrade(engine, alembic_config,
+ migration_utils.SPOT_JOBS_VERSION)


  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
  global _SQLALCHEMY_ENGINE
  if _SQLALCHEMY_ENGINE is not None:
  return _SQLALCHEMY_ENGINE
- with _DB_INIT_LOCK:
+ with migration_utils.db_lock(migration_utils.SPOT_JOBS_DB_NAME):
  if _SQLALCHEMY_ENGINE is None:
  conn_string = None
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
sky/jobs/utils.py CHANGED
@@ -67,6 +67,9 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5

  _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5

+ _JOB_STATUS_FETCH_MAX_RETRIES = 3
+ _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
+
  _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
  'Waiting for task to start[/]'
  '{status_str}. It may take a few minutes.\n'
@@ -250,19 +253,31 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
  logger.info(f'Cluster {cluster_name} not found.')
  return None
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
- status = None
- try:
- logger.info('=== Checking the job status... ===')
- statuses = backend.get_job_status(handle, stream_logs=False)
- status = list(statuses.values())[0]
- if status is None:
- logger.info('No job found.')
- else:
- logger.info(f'Job status: {status}')
- except exceptions.CommandError:
- logger.info('Failed to connect to the cluster.')
- logger.info('=' * 34)
- return status
+ for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
+ try:
+ logger.info('=== Checking the job status... ===')
+ statuses = backend.get_job_status(handle, stream_logs=False)
+ status = list(statuses.values())[0]
+ if status is None:
+ logger.info('No job found.')
+ else:
+ logger.info(f'Job status: {status}')
+ logger.info('=' * 34)
+ return status
+ except exceptions.CommandError as e:
+ # Retry on k8s transient network errors. This is useful when using
+ # coreweave which may have transient network issue sometimes.
+ if (e.detailed_reason is not None and
+ _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
+ logger.info('Failed to connect to the cluster. Retrying '
+ f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+ logger.info('=' * 34)
+ time.sleep(1)
+ else:
+ logger.info(f'Failed to get job status: {e.detailed_reason}')
+ logger.info('=' * 34)
+ return None
+ return None


  def _controller_process_alive(pid: int, job_id: int) -> bool:
sky/provision/nebius/utils.py CHANGED
@@ -41,10 +41,7 @@ def get_project_by_region(region: str) -> str:

  # Check is there project if in config
  project_id = skypilot_config.get_effective_region_config(
- cloud='nebius',
- region=None,
- keys=(region, 'project_id'),
- default_value=None)
+ cloud='nebius', region=region, keys=('project_id',), default_value=None)
  if project_id is not None:
  return project_id
  for project in projects.items:
@@ -189,8 +186,8 @@ def launch(cluster_name_on_cloud: str,
  if preset == '8gpu-128vcpu-1600gb':
  fabric = skypilot_config.get_effective_region_config(
  cloud='nebius',
- region=None,
- keys=(region, 'fabric'),
+ region=region,
+ keys=('fabric',),
  default_value=None)

  # Auto-select fabric if network_tier=best and no fabric configured
sky/schemas/db/README ADDED
@@ -0,0 +1,4 @@
+ Migrations for sqlalchemy databases. Currently includes:
+ global_user_state
+ spot_jobs (managed jobs state)
+ skypilot_config