skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (160) hide show
  1. sky/__init__.py +4 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +50 -24
  4. sky/backends/cloud_vm_ray_backend.py +41 -38
  5. sky/catalog/__init__.py +3 -1
  6. sky/catalog/aws_catalog.py +8 -5
  7. sky/catalog/azure_catalog.py +8 -5
  8. sky/catalog/common.py +8 -2
  9. sky/catalog/cudo_catalog.py +5 -2
  10. sky/catalog/do_catalog.py +4 -1
  11. sky/catalog/fluidstack_catalog.py +5 -2
  12. sky/catalog/gcp_catalog.py +8 -5
  13. sky/catalog/hyperbolic_catalog.py +5 -2
  14. sky/catalog/ibm_catalog.py +8 -5
  15. sky/catalog/lambda_catalog.py +8 -5
  16. sky/catalog/nebius_catalog.py +8 -5
  17. sky/catalog/oci_catalog.py +8 -5
  18. sky/catalog/paperspace_catalog.py +4 -1
  19. sky/catalog/runpod_catalog.py +5 -2
  20. sky/catalog/scp_catalog.py +8 -5
  21. sky/catalog/vast_catalog.py +5 -2
  22. sky/catalog/vsphere_catalog.py +4 -1
  23. sky/client/cli/command.py +63 -25
  24. sky/client/sdk.py +61 -11
  25. sky/clouds/aws.py +12 -7
  26. sky/clouds/azure.py +12 -7
  27. sky/clouds/cloud.py +9 -8
  28. sky/clouds/cudo.py +13 -7
  29. sky/clouds/do.py +12 -7
  30. sky/clouds/fluidstack.py +11 -6
  31. sky/clouds/gcp.py +12 -7
  32. sky/clouds/hyperbolic.py +11 -6
  33. sky/clouds/ibm.py +11 -6
  34. sky/clouds/kubernetes.py +7 -3
  35. sky/clouds/lambda_cloud.py +11 -6
  36. sky/clouds/nebius.py +14 -12
  37. sky/clouds/oci.py +12 -7
  38. sky/clouds/paperspace.py +12 -7
  39. sky/clouds/runpod.py +12 -7
  40. sky/clouds/scp.py +11 -6
  41. sky/clouds/vast.py +14 -8
  42. sky/clouds/vsphere.py +11 -6
  43. sky/core.py +6 -1
  44. sky/dashboard/out/404.html +1 -1
  45. sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  48. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  52. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  56. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  57. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  59. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  61. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  62. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  64. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  66. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  67. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  68. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  71. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  73. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  74. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  75. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  76. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  77. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
  78. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  79. sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
  80. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  81. sky/dashboard/out/clusters/[cluster].html +1 -1
  82. sky/dashboard/out/clusters.html +1 -1
  83. sky/dashboard/out/config.html +1 -1
  84. sky/dashboard/out/index.html +1 -1
  85. sky/dashboard/out/infra/[context].html +1 -1
  86. sky/dashboard/out/infra.html +1 -1
  87. sky/dashboard/out/jobs/[job].html +1 -1
  88. sky/dashboard/out/jobs.html +1 -1
  89. sky/dashboard/out/users.html +1 -1
  90. sky/dashboard/out/volumes.html +1 -1
  91. sky/dashboard/out/workspace/new.html +1 -1
  92. sky/dashboard/out/workspaces/[name].html +1 -1
  93. sky/dashboard/out/workspaces.html +1 -1
  94. sky/data/mounting_utils.py +93 -32
  95. sky/global_user_state.py +12 -143
  96. sky/jobs/state.py +9 -88
  97. sky/jobs/utils.py +28 -13
  98. sky/provision/nebius/utils.py +3 -6
  99. sky/schemas/db/README +4 -0
  100. sky/schemas/db/env.py +90 -0
  101. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  102. sky/schemas/db/script.py.mako +28 -0
  103. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  104. sky/serve/client/sdk.py +6 -2
  105. sky/serve/controller.py +7 -3
  106. sky/serve/serve_state.py +1 -1
  107. sky/serve/serve_utils.py +171 -75
  108. sky/serve/server/core.py +17 -6
  109. sky/server/common.py +4 -3
  110. sky/server/requests/payloads.py +2 -0
  111. sky/server/requests/requests.py +1 -1
  112. sky/setup_files/MANIFEST.in +2 -0
  113. sky/setup_files/alembic.ini +148 -0
  114. sky/setup_files/dependencies.py +1 -0
  115. sky/skylet/configs.py +1 -1
  116. sky/skylet/constants.py +4 -0
  117. sky/skylet/job_lib.py +1 -1
  118. sky/skypilot_config.py +1 -1
  119. sky/users/permission.py +1 -1
  120. sky/utils/common_utils.py +85 -3
  121. sky/utils/config_utils.py +15 -0
  122. sky/utils/db/__init__.py +0 -0
  123. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  124. sky/utils/db/migration_utils.py +93 -0
  125. sky/utils/locks.py +319 -0
  126. sky/utils/schemas.py +38 -34
  127. sky/utils/timeline.py +41 -0
  128. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
  130. sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  132. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  135. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  137. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  139. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  140. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  142. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  143. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  146. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  147. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
  155. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  156. /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
  157. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
  158. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
  159. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
  160. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -12,8 +12,8 @@ import shutil
12
12
  import threading
13
13
  import time
14
14
  import typing
15
- from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
16
- Optional, TextIO, Type, TypeVar, Union)
15
+ from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
16
+ List, Optional, TextIO, Type, TypeVar, Union)
17
17
  import uuid
18
18
 
19
19
  import colorama
@@ -782,6 +782,54 @@ def get_latest_version_with_min_replicas(
782
782
  return active_versions[-1] if active_versions else None
783
783
 
784
784
 
785
+ def _process_line(line: str,
786
+ cluster_name: str,
787
+ stop_on_eof: bool = False) -> Iterator[str]:
788
+ # The line might be directing users to view logs, like
789
+ # `✓ Cluster launched: new-http. View logs at: *.log`
790
+ # We should tail the detailed logs for user.
791
+ def cluster_is_up() -> bool:
792
+ cluster_record = global_user_state.get_cluster_from_name(cluster_name)
793
+ if cluster_record is None:
794
+ return False
795
+ return cluster_record['status'] == status_lib.ClusterStatus.UP
796
+
797
+ provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
798
+ log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
799
+
800
+ if provision_log_prompt is not None:
801
+ nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
802
+
803
+ try:
804
+ with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
805
+ # We still exit if more than 10 seconds without new content
806
+ # to avoid any internal bug that causes the launch to fail
807
+ # while cluster status remains INIT.
808
+ yield from log_utils.follow_logs(f,
809
+ should_stop=cluster_is_up,
810
+ stop_on_eof=stop_on_eof,
811
+ idle_timeout_seconds=10)
812
+ except FileNotFoundError:
813
+ yield line
814
+
815
+ yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
816
+ f'Try to expand log file {nested_log_path} but not '
817
+ f'found. Skipping...{colorama.Style.RESET_ALL}')
818
+ pass
819
+ return
820
+
821
+ if log_prompt is not None:
822
+ # Now we skip other logs (file sync logs) since we lack
823
+ # utility to determine when these log files are finished
824
+ # writing.
825
+ # TODO(tian): We should not skip these logs since there are
826
+ # small chance that error will happen in file sync. Need to
827
+ # find a better way to do this.
828
+ return
829
+
830
+ yield line
831
+
832
+
785
833
  def _follow_logs_with_provision_expanding(
786
834
  file: TextIO,
787
835
  cluster_name: str,
@@ -804,51 +852,8 @@ def _follow_logs_with_provision_expanding(
804
852
  Log lines, including expanded content from referenced provision logs.
805
853
  """
806
854
 
807
- def cluster_is_up() -> bool:
808
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
809
- if cluster_record is None:
810
- return False
811
- return cluster_record['status'] == status_lib.ClusterStatus.UP
812
-
813
855
  def process_line(line: str) -> Iterator[str]:
814
- # The line might be directing users to view logs, like
815
- # `✓ Cluster launched: new-http. View logs at: *.log`
816
- # We should tail the detailed logs for user.
817
- provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
818
- log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
819
-
820
- if provision_log_prompt is not None:
821
- nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
822
-
823
- try:
824
- with open(nested_log_path, 'r', newline='',
825
- encoding='utf-8') as f:
826
- # We still exit if more than 10 seconds without new content
827
- # to avoid any internal bug that causes the launch to fail
828
- # while cluster status remains INIT.
829
- yield from log_utils.follow_logs(f,
830
- should_stop=cluster_is_up,
831
- stop_on_eof=stop_on_eof,
832
- idle_timeout_seconds=10)
833
- except FileNotFoundError:
834
- yield line
835
-
836
- yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
837
- f'Try to expand log file {nested_log_path} but not '
838
- f'found. Skipping...{colorama.Style.RESET_ALL}')
839
- pass
840
- return
841
-
842
- if log_prompt is not None:
843
- # Now we skip other logs (file sync logs) since we lack
844
- # utility to determine when these log files are finished
845
- # writing.
846
- # TODO(tian): We should not skip these logs since there are
847
- # small chance that error will happen in file sync. Need to
848
- # find a better way to do this.
849
- return
850
-
851
- yield line
856
+ yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)
852
857
 
853
858
  return log_utils.follow_logs(file,
854
859
  should_stop=should_stop,
@@ -857,18 +862,51 @@ def _follow_logs_with_provision_expanding(
857
862
  idle_timeout_seconds=idle_timeout_seconds)
858
863
 
859
864
 
860
- def stream_replica_logs(service_name: str, replica_id: int,
861
- follow: bool) -> str:
865
+ def _capped_follow_logs_with_provision_expanding(
866
+ log_list: List[str],
867
+ cluster_name: str,
868
+ *,
869
+ line_cap: int = 100,
870
+ ) -> Iterator[str]:
871
+ """Follows logs and expands any provision.log references found.
872
+
873
+ Args:
874
+ log_list: List of Log Lines to read from.
875
+ cluster_name: Name of the cluster being launched.
876
+ line_cap: Number of last lines to return
877
+
878
+ Yields:
879
+ Log lines, including expanded content from referenced provision logs.
880
+ """
881
+ all_lines: Deque[str] = collections.deque(maxlen=line_cap)
882
+
883
+ for line in log_list:
884
+ for processed in _process_line(line=line,
885
+ cluster_name=cluster_name,
886
+ stop_on_eof=False):
887
+ all_lines.append(processed)
888
+
889
+ yield from all_lines
890
+
891
+
892
+ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
893
+ tail: Optional[int]) -> str:
862
894
  msg = check_service_status_healthy(service_name)
863
895
  if msg is not None:
864
896
  return msg
865
897
  print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
866
898
  f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
867
-
868
899
  log_file_name = generate_replica_log_file_name(service_name, replica_id)
869
900
  if os.path.exists(log_file_name):
870
- with open(log_file_name, 'r', encoding='utf-8') as f:
871
- print(f.read(), flush=True)
901
+ if tail is not None:
902
+ lines = common_utils.read_last_n_lines(log_file_name, tail)
903
+ for line in lines:
904
+ if not line.endswith('\n'):
905
+ line += '\n'
906
+ print(line, end='', flush=True)
907
+ else:
908
+ with open(log_file_name, 'r', encoding='utf-8') as f:
909
+ print(f.read(), flush=True)
872
910
  return ''
873
911
 
874
912
  launch_log_file_name = generate_replica_launch_log_file_name(
@@ -891,24 +929,48 @@ def stream_replica_logs(service_name: str, replica_id: int,
891
929
 
892
930
  replica_provisioned = (
893
931
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
894
- with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
895
- for line in _follow_logs_with_provision_expanding(
896
- f,
897
- replica_cluster_name,
898
- should_stop=replica_provisioned,
899
- stop_on_eof=not follow,
900
- ):
901
- print(line, end='', flush=True)
932
+
933
+ # Handle launch logs based on number parameter
934
+ final_lines_to_print = []
935
+ if tail is not None:
936
+ static_lines = common_utils.read_last_n_lines(launch_log_file_name,
937
+ tail)
938
+ lines = list(
939
+ _capped_follow_logs_with_provision_expanding(
940
+ log_list=static_lines,
941
+ cluster_name=replica_cluster_name,
942
+ line_cap=tail,
943
+ ))
944
+ final_lines_to_print += lines
945
+ else:
946
+ with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
947
+ for line in _follow_logs_with_provision_expanding(
948
+ f,
949
+ replica_cluster_name,
950
+ should_stop=replica_provisioned,
951
+ stop_on_eof=not follow,
952
+ ):
953
+ print(line, end='', flush=True)
902
954
 
903
955
  if (not follow and
904
956
  _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
905
957
  # Early exit if not following the logs.
958
+ if tail is not None:
959
+ for line in final_lines_to_print:
960
+ if not line.endswith('\n'):
961
+ line += '\n'
962
+ print(line, end='', flush=True)
906
963
  return ''
907
964
 
908
965
  backend = backends.CloudVmRayBackend()
909
966
  handle = global_user_state.get_handle_from_cluster_name(
910
967
  replica_cluster_name)
911
968
  if handle is None:
969
+ if tail is not None:
970
+ for line in final_lines_to_print:
971
+ if not line.endswith('\n'):
972
+ line += '\n'
973
+ print(line, end='', flush=True)
912
974
  return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
913
975
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
914
976
 
@@ -917,15 +979,37 @@ def stream_replica_logs(service_name: str, replica_id: int,
917
979
  f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
918
980
 
919
981
  # Always tail the latest logs, which represent user setup & run.
920
- returncode = backend.tail_logs(handle, job_id=None, follow=follow)
921
- if returncode != 0:
922
- return (f'{colorama.Fore.RED}Failed to stream logs for replica '
923
- f'{replica_id}.{colorama.Style.RESET_ALL}')
982
+ if tail is None:
983
+ returncode = backend.tail_logs(handle, job_id=None, follow=follow)
984
+ if returncode != 0:
985
+ return (f'{colorama.Fore.RED}Failed to stream logs for replica '
986
+ f'{replica_id}.{colorama.Style.RESET_ALL}')
987
+ elif not follow and tail > 0:
988
+ final = backend.tail_logs(handle,
989
+ job_id=None,
990
+ follow=follow,
991
+ tail=tail,
992
+ stream_logs=False,
993
+ require_outputs=True,
994
+ process_stream=True)
995
+ if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
996
+ if tail is not None:
997
+ for line in final_lines_to_print:
998
+ if not line.endswith('\n'):
999
+ line += '\n'
1000
+ print(line, end='', flush=True)
1001
+ return (f'{colorama.Fore.RED}Failed to stream logs for replica '
1002
+ f'{replica_id}.{colorama.Style.RESET_ALL}')
1003
+ final_lines_to_print += final[1].splitlines()
1004
+ for line in final_lines_to_print[-tail:]:
1005
+ if not line.endswith('\n'):
1006
+ line += '\n'
1007
+ print(line, end='', flush=True)
924
1008
  return ''
925
1009
 
926
1010
 
927
1011
  def stream_serve_process_logs(service_name: str, stream_controller: bool,
928
- follow: bool) -> str:
1012
+ follow: bool, tail: Optional[int]) -> str:
929
1013
  msg = check_service_status_healthy(service_name)
930
1014
  if msg is not None:
931
1015
  return msg
@@ -940,14 +1024,24 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
940
1024
  return True
941
1025
  return record['status'] in serve_state.ServiceStatus.failed_statuses()
942
1026
 
943
- with open(os.path.expanduser(log_file), 'r', newline='',
944
- encoding='utf-8') as f:
945
- for line in log_utils.follow_logs(
946
- f,
947
- should_stop=_service_is_terminal,
948
- stop_on_eof=not follow,
949
- ):
1027
+ if tail is not None:
1028
+ lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
1029
+ tail)
1030
+ for line in lines:
1031
+ if not line.endswith('\n'):
1032
+ line += '\n'
950
1033
  print(line, end='', flush=True)
1034
+ else:
1035
+ with open(os.path.expanduser(log_file),
1036
+ 'r',
1037
+ newline='',
1038
+ encoding='utf-8') as f:
1039
+ for line in log_utils.follow_logs(
1040
+ f,
1041
+ should_stop=_service_is_terminal,
1042
+ stop_on_eof=not follow,
1043
+ ):
1044
+ print(line, end='', flush=True)
951
1045
  return ''
952
1046
 
953
1047
 
@@ -1140,20 +1234,22 @@ class ServeCodeGen:
1140
1234
 
1141
1235
  @classmethod
1142
1236
  def stream_replica_logs(cls, service_name: str, replica_id: int,
1143
- follow: bool) -> str:
1237
+ follow: bool, tail: Optional[int]) -> str:
1144
1238
  code = [
1145
1239
  'msg = serve_utils.stream_replica_logs('
1146
- f'{service_name!r}, {replica_id!r}, follow={follow})',
1240
+ f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
1147
1241
  'print(msg, flush=True)'
1148
1242
  ]
1149
1243
  return cls._build(code)
1150
1244
 
1151
1245
  @classmethod
1152
1246
  def stream_serve_process_logs(cls, service_name: str,
1153
- stream_controller: bool, follow: bool) -> str:
1247
+ stream_controller: bool, follow: bool,
1248
+ tail: Optional[int]) -> str:
1154
1249
  code = [
1155
1250
  f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
1156
- f'{stream_controller}, follow={follow})', 'print(msg, flush=True)'
1251
+ f'{stream_controller}, follow={follow}, tail={tail})',
1252
+ 'print(msg, flush=True)'
1157
1253
  ]
1158
1254
  return cls._build(code)
1159
1255
 
sky/serve/server/core.py CHANGED
@@ -740,6 +740,7 @@ def tail_logs(
740
740
  target: ServiceComponentOrStr,
741
741
  replica_id: Optional[int] = None,
742
742
  follow: bool = True,
743
+ tail: Optional[int] = None,
743
744
  ) -> None:
744
745
  """Tails logs for a service.
745
746
 
@@ -805,11 +806,14 @@ def tail_logs(
805
806
  service_name,
806
807
  stream_controller=(
807
808
  target == serve_utils.ServiceComponent.CONTROLLER),
808
- follow=follow)
809
+ follow=follow,
810
+ tail=tail)
809
811
  else:
810
812
  assert replica_id is not None, service_name
811
- code = serve_utils.ServeCodeGen.stream_replica_logs(
812
- service_name, replica_id, follow)
813
+ code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
814
+ replica_id,
815
+ follow,
816
+ tail=tail)
813
817
 
814
818
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
815
819
  # kill the process, so we need to handle it manually here.
@@ -834,6 +838,7 @@ def sync_down_logs(
834
838
  targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
835
839
  None] = None,
836
840
  replica_ids: Optional[List[int]] = None,
841
+ tail: Optional[int] = None,
837
842
  ) -> str:
838
843
  """Sync down logs from the controller for the given service.
839
844
 
@@ -936,16 +941,22 @@ def sync_down_logs(
936
941
  if component == serve_utils.ServiceComponent.CONTROLLER:
937
942
  stream_logs_code = (
938
943
  serve_utils.ServeCodeGen.stream_serve_process_logs(
939
- service_name, stream_controller=True, follow=False))
944
+ service_name,
945
+ stream_controller=True,
946
+ follow=False,
947
+ tail=tail))
940
948
  elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
941
949
  stream_logs_code = (
942
950
  serve_utils.ServeCodeGen.stream_serve_process_logs(
943
- service_name, stream_controller=False, follow=False))
951
+ service_name,
952
+ stream_controller=False,
953
+ follow=False,
954
+ tail=tail))
944
955
  elif component == serve_utils.ServiceComponent.REPLICA:
945
956
  replica_id = target.replica_id
946
957
  assert replica_id is not None, service_name
947
958
  stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
948
- service_name, replica_id, follow=False)
959
+ service_name, replica_id, follow=False, tail=tail)
949
960
  else:
950
961
  assert False, component
951
962
 
sky/server/common.py CHANGED
@@ -252,8 +252,9 @@ def get_dashboard_url(server_url: str,
252
252
 
253
253
 
254
254
  @annotations.lru_cache(scope='global')
255
- def is_api_server_local():
256
- return get_server_url() in AVAILABLE_LOCAL_API_SERVER_URLS
255
+ def is_api_server_local(endpoint: Optional[str] = None):
256
+ server_url = endpoint if endpoint is not None else get_server_url()
257
+ return server_url in AVAILABLE_LOCAL_API_SERVER_URLS
257
258
 
258
259
 
259
260
  def _handle_non_200_server_status(
@@ -566,7 +567,7 @@ def check_server_healthy(
566
567
  api_server_status = api_server_info.status
567
568
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
568
569
  msg = api_server_info.error
569
- if is_api_server_local():
570
+ if is_api_server_local(endpoint):
570
571
  # For local server, just hint user to restart the server to get
571
572
  # a consistent version.
572
573
  msg = _LOCAL_API_SERVER_RESTART_HINT
@@ -557,6 +557,7 @@ class ServeLogsBody(RequestBody):
557
557
  target: Union[str, serve.ServiceComponent]
558
558
  replica_id: Optional[int] = None
559
559
  follow: bool = True
560
+ tail: Optional[int] = None
560
561
 
561
562
 
562
563
  class ServeDownloadLogsBody(RequestBody):
@@ -566,6 +567,7 @@ class ServeDownloadLogsBody(RequestBody):
566
567
  targets: Optional[Union[str, serve.ServiceComponent,
567
568
  List[Union[str, serve.ServiceComponent]]]]
568
569
  replica_ids: Optional[List[int]] = None
570
+ tail: Optional[int] = None
569
571
 
570
572
 
571
573
  class ServeStatusBody(RequestBody):
@@ -29,10 +29,10 @@ from sky.server.requests.serializers import decoders
29
29
  from sky.server.requests.serializers import encoders
30
30
  from sky.utils import common
31
31
  from sky.utils import common_utils
32
- from sky.utils import db_utils
33
32
  from sky.utils import env_options
34
33
  from sky.utils import subprocess_utils
35
34
  from sky.utils import ux_utils
35
+ from sky.utils.db import db_utils
36
36
 
37
37
  logger = sky_logging.init_logger(__name__)
38
38
 
@@ -18,3 +18,5 @@ include sky/server/html/*
18
18
  recursive-include sky/dashboard/out *
19
19
  include sky/users/*.conf
20
20
  include sky/utils/*.sh
21
+ include sky/setup_files/alembic.ini
22
+ recursive-include sky/schemas/db *
@@ -0,0 +1,148 @@
1
+ # alembic configuration for global user state, jobs state, and sky config db migrations.
2
+
3
+ [DEFAULT]
4
+ # path to migration scripts.
5
+ # this is typically a path given in POSIX (e.g. forward slashes)
6
+ # format, relative to the token %(here)s which refers to the location of this
7
+ # ini file
8
+ script_location = %(here)s/../schemas/db
9
+
10
+ # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
11
+ # Uncomment the line below if you want the files to be prepended with date and time
12
+ # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
13
+ # for all available tokens
14
+ # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
15
+
16
+ # sys.path path, will be prepended to sys.path if present.
17
+ # defaults to the current working directory. for multiple paths, the path separator
18
+ # is defined by "path_separator" below.
19
+ prepend_sys_path = .
20
+
21
+
22
+ # timezone to use when rendering the date within the migration file
23
+ # as well as the filename.
24
+ # If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
25
+ # Any required deps can installed by adding `alembic[tz]` to the pip requirements
26
+ # string value is passed to ZoneInfo()
27
+ # leave blank for localtime
28
+ # timezone =
29
+
30
+ # max length of characters to apply to the "slug" field
31
+ # truncate_slug_length = 40
32
+
33
+ # set to 'true' to run the environment during
34
+ # the 'revision' command, regardless of autogenerate
35
+ # revision_environment = false
36
+
37
+ # set to 'true' to allow .pyc and .pyo files without
38
+ # a source .py file to be detected as revisions in the
39
+ # versions/ directory
40
+ # sourceless = false
41
+
42
+ # version location specification; This defaults
43
+ # to <script_location>/versions. When using multiple version
44
+ # directories, initial revisions must be specified with --version-path.
45
+ # The path separator used here should be the separator specified by "path_separator"
46
+ # below.
47
+ # version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
48
+
49
+ # path_separator; This indicates what character is used to split lists of file
50
+ # paths, including version_locations and prepend_sys_path within configparser
51
+ # files such as alembic.ini.
52
+ # The default rendered in new alembic.ini files is "os", which uses os.pathsep
53
+ # to provide os-dependent path splitting.
54
+ #
55
+ # Note that in order to support legacy alembic.ini files, this default does NOT
56
+ # take place if path_separator is not present in alembic.ini. If this
57
+ # option is omitted entirely, fallback logic is as follows:
58
+ #
59
+ # 1. Parsing of the version_locations option falls back to using the legacy
60
+ # "version_path_separator" key, which if absent then falls back to the legacy
61
+ # behavior of splitting on spaces and/or commas.
62
+ # 2. Parsing of the prepend_sys_path option falls back to the legacy
63
+ # behavior of splitting on spaces, commas, or colons.
64
+ #
65
+ # Valid values for path_separator are:
66
+ #
67
+ # path_separator = :
68
+ # path_separator = ;
69
+ # path_separator = space
70
+ # path_separator = newline
71
+ #
72
+ # Use os.pathsep. Default configuration used for new projects.
73
+ path_separator = os
74
+
75
+ # set to 'true' to search source files recursively
76
+ # in each "version_locations" directory
77
+ # new in Alembic version 1.10
78
+ # recursive_version_locations = false
79
+
80
+ # the output encoding used when revision files
81
+ # are written from script.py.mako
82
+ # output_encoding = utf-8
83
+
84
+ # database URL. This is consumed by the user-maintained env.py script only.
85
+ # other means of configuring database URLs may be customized within the env.py
86
+ # file.
87
+ # sqlalchemy.url = driver://user:pass@localhost/dbname
88
+
89
+ [state_db]
90
+ version_locations = %(here)s/../schemas/db/global_user_state
91
+ version_table = alembic_version_state_db
92
+
93
+ [spot_jobs_db]
94
+ version_locations = %(here)s/../schemas/db/spot_jobs
95
+ version_table = alembic_version_spot_jobs_db
96
+
97
+ [post_write_hooks]
98
+ # post_write_hooks defines scripts or Python functions that are run
99
+ # on newly generated revision scripts. See the documentation for further
100
+ # detail and examples
101
+
102
+ # format using "black" - use the console_scripts runner, against the "black" entrypoint
103
+ # hooks = black
104
+ # black.type = console_scripts
105
+ # black.entrypoint = black
106
+ # black.options = -l 79 REVISION_SCRIPT_FILENAME
107
+
108
+ # lint with attempts to fix using "ruff" - use the exec runner, execute a binary
109
+ # hooks = ruff
110
+ # ruff.type = exec
111
+ # ruff.executable = %(here)s/.venv/bin/ruff
112
+ # ruff.options = check --fix REVISION_SCRIPT_FILENAME
113
+
114
+ # Logging configuration. This is also consumed by the user-maintained
115
+ # env.py script only.
116
+ [loggers]
117
+ keys = root,sqlalchemy,alembic
118
+
119
+ [handlers]
120
+ keys = console
121
+
122
+ [formatters]
123
+ keys = generic
124
+
125
+ [logger_root]
126
+ level = WARNING
127
+ handlers = console
128
+ qualname =
129
+
130
+ [logger_sqlalchemy]
131
+ level = WARNING
132
+ handlers =
133
+ qualname = sqlalchemy.engine
134
+
135
+ [logger_alembic]
136
+ level = WARNING
137
+ handlers =
138
+ qualname = alembic
139
+
140
+ [handler_console]
141
+ class = StreamHandler
142
+ args = (sys.stderr,)
143
+ level = NOTSET
144
+ formatter = generic
145
+
146
+ [formatter_generic]
147
+ format = %(levelname)-5.5s [%(name)s] %(message)s
148
+ datefmt = %H:%M:%S
@@ -68,6 +68,7 @@ install_requires = [
68
68
  'pyjwt',
69
69
  'gitpython',
70
70
  'types-paramiko',
71
+ 'alembic',
71
72
  ]
72
73
 
73
74
  server_dependencies = [
sky/skylet/configs.py CHANGED
@@ -5,7 +5,7 @@ import pathlib
5
5
  import threading
6
6
  from typing import Callable, Optional, Union
7
7
 
8
- from sky.utils import db_utils
8
+ from sky.utils.db import db_utils
9
9
 
10
10
  _DB_PATH = None
11
11
  _db_init_lock = threading.Lock()
sky/skylet/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants for SkyPilot."""
2
+ import os
2
3
  from typing import List, Tuple
3
4
 
4
5
  from packaging import version
@@ -491,3 +492,6 @@ DEFAULT_PRIORITY = 0
491
492
 
492
493
  GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
493
494
  COST_REPORT_DEFAULT_DAYS = 30
495
+
496
+ # The directory for file locks.
497
+ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
sky/skylet/job_lib.py CHANGED
@@ -24,10 +24,10 @@ from sky import sky_logging
24
24
  from sky.adaptors import common as adaptors_common
25
25
  from sky.skylet import constants
26
26
  from sky.utils import common_utils
27
- from sky.utils import db_utils
28
27
  from sky.utils import log_utils
29
28
  from sky.utils import message_utils
30
29
  from sky.utils import subprocess_utils
30
+ from sky.utils.db import db_utils
31
31
 
32
32
  if typing.TYPE_CHECKING:
33
33
  import psutil
sky/skypilot_config.py CHANGED
@@ -73,9 +73,9 @@ from sky.skylet import constants
73
73
  from sky.utils import common_utils
74
74
  from sky.utils import config_utils
75
75
  from sky.utils import context
76
- from sky.utils import db_utils
77
76
  from sky.utils import schemas
78
77
  from sky.utils import ux_utils
78
+ from sky.utils.db import db_utils
79
79
  from sky.utils.kubernetes import config_map_utils
80
80
 
81
81
  if typing.TYPE_CHECKING:
sky/users/permission.py CHANGED
@@ -15,7 +15,7 @@ from sky import sky_logging
15
15
  from sky.skylet import constants
16
16
  from sky.users import rbac
17
17
  from sky.utils import common_utils
18
- from sky.utils import db_utils
18
+ from sky.utils.db import db_utils
19
19
 
20
20
  logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
21
21
  logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)