skypilot-nightly 1.0.0.dev20250717__py3-none-any.whl → 1.0.0.dev20250720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/backends/backend_utils.py +23 -13
  3. sky/backends/cloud_vm_ray_backend.py +19 -11
  4. sky/catalog/__init__.py +3 -1
  5. sky/catalog/aws_catalog.py +8 -5
  6. sky/catalog/azure_catalog.py +8 -5
  7. sky/catalog/common.py +8 -2
  8. sky/catalog/cudo_catalog.py +5 -2
  9. sky/catalog/do_catalog.py +4 -1
  10. sky/catalog/fluidstack_catalog.py +5 -2
  11. sky/catalog/gcp_catalog.py +8 -5
  12. sky/catalog/hyperbolic_catalog.py +5 -2
  13. sky/catalog/ibm_catalog.py +8 -5
  14. sky/catalog/lambda_catalog.py +8 -5
  15. sky/catalog/nebius_catalog.py +8 -5
  16. sky/catalog/oci_catalog.py +8 -5
  17. sky/catalog/paperspace_catalog.py +4 -1
  18. sky/catalog/runpod_catalog.py +5 -2
  19. sky/catalog/scp_catalog.py +8 -5
  20. sky/catalog/vast_catalog.py +5 -2
  21. sky/catalog/vsphere_catalog.py +4 -1
  22. sky/client/cli/command.py +25 -2
  23. sky/client/sdk.py +10 -5
  24. sky/clouds/aws.py +12 -7
  25. sky/clouds/azure.py +12 -7
  26. sky/clouds/cloud.py +9 -8
  27. sky/clouds/cudo.py +13 -7
  28. sky/clouds/do.py +12 -7
  29. sky/clouds/fluidstack.py +11 -6
  30. sky/clouds/gcp.py +12 -7
  31. sky/clouds/hyperbolic.py +11 -6
  32. sky/clouds/ibm.py +11 -6
  33. sky/clouds/kubernetes.py +7 -3
  34. sky/clouds/lambda_cloud.py +11 -6
  35. sky/clouds/nebius.py +12 -7
  36. sky/clouds/oci.py +12 -7
  37. sky/clouds/paperspace.py +12 -7
  38. sky/clouds/runpod.py +12 -7
  39. sky/clouds/scp.py +11 -6
  40. sky/clouds/vast.py +12 -7
  41. sky/clouds/vsphere.py +11 -6
  42. sky/core.py +6 -1
  43. sky/dashboard/out/404.html +1 -1
  44. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/1871-a821dcaaae2a3823.js +6 -0
  46. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.5233e938f14e31a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/938-63fc419cb82ad9b3.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/{9984.b56614f3c4c5961d.js → 9984.2b5e3fa69171bff9.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +20 -0
  54. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa406155b4223d0d.js +11 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-14d404b7dd28502a.js → [job]-c5b357bfd9502fbe.js} +1 -1
  56. sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +1 -0
  57. sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +3 -0
  58. sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_buildManifest.js +1 -1
  59. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  60. sky/dashboard/out/clusters/[cluster].html +1 -1
  61. sky/dashboard/out/clusters.html +1 -1
  62. sky/dashboard/out/config.html +1 -1
  63. sky/dashboard/out/index.html +1 -1
  64. sky/dashboard/out/infra/[context].html +1 -1
  65. sky/dashboard/out/infra.html +1 -1
  66. sky/dashboard/out/jobs/[job].html +1 -1
  67. sky/dashboard/out/jobs.html +1 -1
  68. sky/dashboard/out/users.html +1 -1
  69. sky/dashboard/out/volumes.html +1 -1
  70. sky/dashboard/out/workspace/new.html +1 -1
  71. sky/dashboard/out/workspaces/[name].html +1 -1
  72. sky/dashboard/out/workspaces.html +1 -1
  73. sky/global_user_state.py +13 -143
  74. sky/jobs/client/sdk.py +1 -1
  75. sky/jobs/server/core.py +14 -0
  76. sky/jobs/state.py +9 -88
  77. sky/jobs/utils.py +28 -13
  78. sky/schemas/db/README +4 -0
  79. sky/schemas/db/env.py +90 -0
  80. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  81. sky/schemas/db/script.py.mako +28 -0
  82. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  83. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  84. sky/serve/client/sdk.py +7 -3
  85. sky/serve/controller.py +7 -3
  86. sky/serve/serve_state.py +1 -1
  87. sky/serve/serve_utils.py +171 -75
  88. sky/serve/server/core.py +17 -6
  89. sky/server/common.py +4 -0
  90. sky/server/requests/payloads.py +2 -0
  91. sky/server/requests/requests.py +1 -1
  92. sky/server/rest.py +71 -26
  93. sky/setup_files/MANIFEST.in +2 -0
  94. sky/setup_files/alembic.ini +152 -0
  95. sky/setup_files/dependencies.py +1 -0
  96. sky/skylet/configs.py +1 -1
  97. sky/skylet/job_lib.py +1 -1
  98. sky/skypilot_config.py +32 -6
  99. sky/users/permission.py +1 -1
  100. sky/utils/common_utils.py +77 -0
  101. sky/utils/db/__init__.py +0 -0
  102. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  103. sky/utils/db/migration_utils.py +53 -0
  104. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/METADATA +2 -1
  105. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/RECORD +110 -101
  106. sky/dashboard/out/_next/static/chunks/1043-90a88c46f27b3df5.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  108. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  109. sky/dashboard/out/_next/static/chunks/8969-743abf4bc86baf48.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  113. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +0 -6
  114. sky/dashboard/out/_next/static/chunks/webpack-c3b45b7b0eaef66f.js +0 -1
  115. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  116. /sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,28 @@
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ ${imports if imports else ""}
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = ${repr(up_revision)}
16
+ down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
17
+ branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18
+ depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19
+
20
+
21
+ def upgrade() -> None:
22
+ """Upgrade schema."""
23
+ ${upgrades if upgrades else "pass"}
24
+
25
+
26
+ def downgrade() -> None:
27
+ """Downgrade schema."""
28
+ ${downgrades if downgrades else "pass"}
@@ -0,0 +1,30 @@
1
+ """Initial schema for sky config database
2
+
3
+ Revision ID: 001
4
+ Revises:
5
+ Create Date: 2024-01-01 12:00:00.000000
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from alembic import op
10
+
11
+ from sky.skypilot_config import Base
12
+ from sky.utils.db import db_utils
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision = '001'
16
+ down_revision = None
17
+ branch_labels = None
18
+ depends_on = None
19
+
20
+
21
+ def upgrade():
22
+ """Create initial schema for config_yaml table"""
23
+ with op.get_context().autocommit_block():
24
+ # Create all tables with their current schema
25
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
+
27
+
28
+ def downgrade():
29
+ """Drop all tables"""
30
+ Base.metadata.drop_all(bind=op.get_bind())
@@ -0,0 +1,97 @@
1
+ """Initial schema for spot jobs database with backwards compatibility columns
2
+
3
+ Revision ID: 001
4
+ Revises:
5
+ Create Date: 2024-01-01 12:00:00.000000
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ import json
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.jobs.state import Base
15
+ from sky.skylet import constants
16
+ from sky.utils.db import db_utils
17
+
18
+ # revision identifiers, used by Alembic.
19
+ revision = '001'
20
+ down_revision = None
21
+ branch_labels = None
22
+ depends_on = None
23
+
24
+
25
+ def upgrade():
26
+ """Create initial schema and add all backwards compatibility columns"""
27
+ with op.get_context().autocommit_block():
28
+ # Create all tables with their current schema
29
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
30
+
31
+ # Add backwards compatibility columns using helper function that matches
32
+ # original add_column_to_table_sqlalchemy behavior exactly
33
+
34
+ # Spot table columns
35
+ db_utils.add_column_to_table_alembic('spot', 'failure_reason',
36
+ sa.Text())
37
+ db_utils.add_column_to_table_alembic('spot',
38
+ 'spot_job_id',
39
+ sa.Integer(),
40
+ copy_from='job_id')
41
+ db_utils.add_column_to_table_alembic(
42
+ 'spot',
43
+ 'task_id',
44
+ sa.Integer(),
45
+ server_default='0',
46
+ value_to_replace_existing_entries=0)
47
+ db_utils.add_column_to_table_alembic('spot',
48
+ 'task_name',
49
+ sa.Text(),
50
+ copy_from='job_name')
51
+ db_utils.add_column_to_table_alembic(
52
+ 'spot',
53
+ 'specs',
54
+ sa.Text(),
55
+ value_to_replace_existing_entries=json.dumps(
56
+ {'max_restarts_on_errors': 0}))
57
+ db_utils.add_column_to_table_alembic('spot', 'local_log_file',
58
+ sa.Text())
59
+ db_utils.add_column_to_table_alembic(
60
+ 'spot',
61
+ 'metadata',
62
+ sa.Text(),
63
+ server_default='{}',
64
+ value_to_replace_existing_entries='{}')
65
+
66
+ # Job info table columns
67
+ db_utils.add_column_to_table_alembic('job_info', 'schedule_state',
68
+ sa.Text())
69
+ db_utils.add_column_to_table_alembic('job_info', 'controller_pid',
70
+ sa.Integer())
71
+ db_utils.add_column_to_table_alembic('job_info', 'dag_yaml_path',
72
+ sa.Text())
73
+ db_utils.add_column_to_table_alembic('job_info', 'env_file_path',
74
+ sa.Text())
75
+ db_utils.add_column_to_table_alembic('job_info', 'user_hash', sa.Text())
76
+ db_utils.add_column_to_table_alembic(
77
+ 'job_info',
78
+ 'workspace',
79
+ sa.Text(),
80
+ value_to_replace_existing_entries=constants.
81
+ SKYPILOT_DEFAULT_WORKSPACE)
82
+ db_utils.add_column_to_table_alembic(
83
+ 'job_info',
84
+ 'priority',
85
+ sa.Integer(),
86
+ server_default=str(constants.DEFAULT_PRIORITY),
87
+ value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
88
+ db_utils.add_column_to_table_alembic('job_info', 'entrypoint',
89
+ sa.Text())
90
+ db_utils.add_column_to_table_alembic('job_info',
91
+ 'original_user_yaml_path',
92
+ sa.Text())
93
+
94
+
95
+ def downgrade():
96
+ """Drop all tables"""
97
+ Base.metadata.drop_all(bind=op.get_bind())
sky/serve/client/sdk.py CHANGED
@@ -292,12 +292,13 @@ def status(
292
292
 
293
293
  @usage_lib.entrypoint
294
294
  @server_common.check_server_healthy_or_start
295
- @rest.retry_on_server_unavailable()
295
+ @rest.retry_transient_errors()
296
296
  def tail_logs(service_name: str,
297
297
  target: Union[str, 'serve_utils.ServiceComponent'],
298
298
  replica_id: Optional[int] = None,
299
299
  follow: bool = True,
300
- output_stream: Optional['io.TextIOBase'] = None) -> None:
300
+ output_stream: Optional['io.TextIOBase'] = None,
301
+ tail: Optional[int] = None) -> None:
301
302
  """Tails logs for a service.
302
303
 
303
304
  Usage:
@@ -367,6 +368,7 @@ def tail_logs(service_name: str,
367
368
  target=target,
368
369
  replica_id=replica_id,
369
370
  follow=follow,
371
+ tail=tail,
370
372
  )
371
373
  response = server_common.make_authenticated_request(
372
374
  'POST',
@@ -390,7 +392,8 @@ def sync_down_logs(service_name: str,
390
392
  str, 'serve_utils.ServiceComponent',
391
393
  List[Union[str,
392
394
  'serve_utils.ServiceComponent']]]] = None,
393
- replica_ids: Optional[List[int]] = None) -> None:
395
+ replica_ids: Optional[List[int]] = None,
396
+ tail: Optional[int] = None) -> None:
394
397
  """Sync down logs from the service components to a local directory.
395
398
 
396
399
  This function syncs logs from the specified service components (controller,
@@ -429,6 +432,7 @@ def sync_down_logs(service_name: str,
429
432
  local_dir=local_dir,
430
433
  targets=targets,
431
434
  replica_ids=replica_ids,
435
+ tail=tail,
432
436
  )
433
437
  response = server_common.make_authenticated_request(
434
438
  'POST',
sky/serve/controller.py CHANGED
@@ -156,9 +156,13 @@ class SkyServeController:
156
156
  return responses.JSONResponse(content={'message': 'Success'},
157
157
  status_code=200)
158
158
  except Exception as e: # pylint: disable=broad-except
159
- logger.error(f'Error in update_service: '
160
- f'{common_utils.format_exception(e)}')
161
- return responses.JSONResponse(content={'message': 'Error'},
159
+ exception_str = common_utils.format_exception(e)
160
+ logger.error(f'Error in update_service: {exception_str}')
161
+ return responses.JSONResponse(content={
162
+ 'message': 'Error',
163
+ 'exception': exception_str,
164
+ 'traceback': traceback.format_exc()
165
+ },
162
166
  status_code=500)
163
167
 
164
168
  @self._app.post('/controller/terminate_replica')
sky/serve/serve_state.py CHANGED
@@ -13,7 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple
13
13
  import colorama
14
14
 
15
15
  from sky.serve import constants
16
- from sky.utils import db_utils
16
+ from sky.utils.db import db_utils
17
17
 
18
18
  if typing.TYPE_CHECKING:
19
19
  from sky.serve import replica_managers
sky/serve/serve_utils.py CHANGED
@@ -12,8 +12,8 @@ import shutil
12
12
  import threading
13
13
  import time
14
14
  import typing
15
- from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
16
- Optional, TextIO, Type, TypeVar, Union)
15
+ from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
16
+ List, Optional, TextIO, Type, TypeVar, Union)
17
17
  import uuid
18
18
 
19
19
  import colorama
@@ -782,6 +782,54 @@ def get_latest_version_with_min_replicas(
782
782
  return active_versions[-1] if active_versions else None
783
783
 
784
784
 
785
+ def _process_line(line: str,
786
+ cluster_name: str,
787
+ stop_on_eof: bool = False) -> Iterator[str]:
788
+ # The line might be directing users to view logs, like
789
+ # `✓ Cluster launched: new-http. View logs at: *.log`
790
+ # We should tail the detailed logs for user.
791
+ def cluster_is_up() -> bool:
792
+ cluster_record = global_user_state.get_cluster_from_name(cluster_name)
793
+ if cluster_record is None:
794
+ return False
795
+ return cluster_record['status'] == status_lib.ClusterStatus.UP
796
+
797
+ provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
798
+ log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
799
+
800
+ if provision_log_prompt is not None:
801
+ nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
802
+
803
+ try:
804
+ with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
805
+ # We still exit if more than 10 seconds without new content
806
+ # to avoid any internal bug that causes the launch to fail
807
+ # while cluster status remains INIT.
808
+ yield from log_utils.follow_logs(f,
809
+ should_stop=cluster_is_up,
810
+ stop_on_eof=stop_on_eof,
811
+ idle_timeout_seconds=10)
812
+ except FileNotFoundError:
813
+ yield line
814
+
815
+ yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
816
+ f'Try to expand log file {nested_log_path} but not '
817
+ f'found. Skipping...{colorama.Style.RESET_ALL}')
818
+ pass
819
+ return
820
+
821
+ if log_prompt is not None:
822
+ # Now we skip other logs (file sync logs) since we lack
823
+ # utility to determine when these log files are finished
824
+ # writing.
825
+ # TODO(tian): We should not skip these logs since there are
826
+ # small chance that error will happen in file sync. Need to
827
+ # find a better way to do this.
828
+ return
829
+
830
+ yield line
831
+
832
+
785
833
  def _follow_logs_with_provision_expanding(
786
834
  file: TextIO,
787
835
  cluster_name: str,
@@ -804,51 +852,8 @@ def _follow_logs_with_provision_expanding(
804
852
  Log lines, including expanded content from referenced provision logs.
805
853
  """
806
854
 
807
- def cluster_is_up() -> bool:
808
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
809
- if cluster_record is None:
810
- return False
811
- return cluster_record['status'] == status_lib.ClusterStatus.UP
812
-
813
855
  def process_line(line: str) -> Iterator[str]:
814
- # The line might be directing users to view logs, like
815
- # `✓ Cluster launched: new-http. View logs at: *.log`
816
- # We should tail the detailed logs for user.
817
- provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
818
- log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
819
-
820
- if provision_log_prompt is not None:
821
- nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
822
-
823
- try:
824
- with open(nested_log_path, 'r', newline='',
825
- encoding='utf-8') as f:
826
- # We still exit if more than 10 seconds without new content
827
- # to avoid any internal bug that causes the launch to fail
828
- # while cluster status remains INIT.
829
- yield from log_utils.follow_logs(f,
830
- should_stop=cluster_is_up,
831
- stop_on_eof=stop_on_eof,
832
- idle_timeout_seconds=10)
833
- except FileNotFoundError:
834
- yield line
835
-
836
- yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
837
- f'Try to expand log file {nested_log_path} but not '
838
- f'found. Skipping...{colorama.Style.RESET_ALL}')
839
- pass
840
- return
841
-
842
- if log_prompt is not None:
843
- # Now we skip other logs (file sync logs) since we lack
844
- # utility to determine when these log files are finished
845
- # writing.
846
- # TODO(tian): We should not skip these logs since there are
847
- # small chance that error will happen in file sync. Need to
848
- # find a better way to do this.
849
- return
850
-
851
- yield line
856
+ yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)
852
857
 
853
858
  return log_utils.follow_logs(file,
854
859
  should_stop=should_stop,
@@ -857,18 +862,51 @@ def _follow_logs_with_provision_expanding(
857
862
  idle_timeout_seconds=idle_timeout_seconds)
858
863
 
859
864
 
860
- def stream_replica_logs(service_name: str, replica_id: int,
861
- follow: bool) -> str:
865
+ def _capped_follow_logs_with_provision_expanding(
866
+ log_list: List[str],
867
+ cluster_name: str,
868
+ *,
869
+ line_cap: int = 100,
870
+ ) -> Iterator[str]:
871
+ """Follows logs and expands any provision.log references found.
872
+
873
+ Args:
874
+ log_list: List of Log Lines to read from.
875
+ cluster_name: Name of the cluster being launched.
876
+ line_cap: Number of last lines to return
877
+
878
+ Yields:
879
+ Log lines, including expanded content from referenced provision logs.
880
+ """
881
+ all_lines: Deque[str] = collections.deque(maxlen=line_cap)
882
+
883
+ for line in log_list:
884
+ for processed in _process_line(line=line,
885
+ cluster_name=cluster_name,
886
+ stop_on_eof=False):
887
+ all_lines.append(processed)
888
+
889
+ yield from all_lines
890
+
891
+
892
+ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
893
+ tail: Optional[int]) -> str:
862
894
  msg = check_service_status_healthy(service_name)
863
895
  if msg is not None:
864
896
  return msg
865
897
  print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
866
898
  f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
867
-
868
899
  log_file_name = generate_replica_log_file_name(service_name, replica_id)
869
900
  if os.path.exists(log_file_name):
870
- with open(log_file_name, 'r', encoding='utf-8') as f:
871
- print(f.read(), flush=True)
901
+ if tail is not None:
902
+ lines = common_utils.read_last_n_lines(log_file_name, tail)
903
+ for line in lines:
904
+ if not line.endswith('\n'):
905
+ line += '\n'
906
+ print(line, end='', flush=True)
907
+ else:
908
+ with open(log_file_name, 'r', encoding='utf-8') as f:
909
+ print(f.read(), flush=True)
872
910
  return ''
873
911
 
874
912
  launch_log_file_name = generate_replica_launch_log_file_name(
@@ -891,24 +929,48 @@ def stream_replica_logs(service_name: str, replica_id: int,
891
929
 
892
930
  replica_provisioned = (
893
931
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
894
- with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
895
- for line in _follow_logs_with_provision_expanding(
896
- f,
897
- replica_cluster_name,
898
- should_stop=replica_provisioned,
899
- stop_on_eof=not follow,
900
- ):
901
- print(line, end='', flush=True)
932
+
933
+ # Handle launch logs based on number parameter
934
+ final_lines_to_print = []
935
+ if tail is not None:
936
+ static_lines = common_utils.read_last_n_lines(launch_log_file_name,
937
+ tail)
938
+ lines = list(
939
+ _capped_follow_logs_with_provision_expanding(
940
+ log_list=static_lines,
941
+ cluster_name=replica_cluster_name,
942
+ line_cap=tail,
943
+ ))
944
+ final_lines_to_print += lines
945
+ else:
946
+ with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
947
+ for line in _follow_logs_with_provision_expanding(
948
+ f,
949
+ replica_cluster_name,
950
+ should_stop=replica_provisioned,
951
+ stop_on_eof=not follow,
952
+ ):
953
+ print(line, end='', flush=True)
902
954
 
903
955
  if (not follow and
904
956
  _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
905
957
  # Early exit if not following the logs.
958
+ if tail is not None:
959
+ for line in final_lines_to_print:
960
+ if not line.endswith('\n'):
961
+ line += '\n'
962
+ print(line, end='', flush=True)
906
963
  return ''
907
964
 
908
965
  backend = backends.CloudVmRayBackend()
909
966
  handle = global_user_state.get_handle_from_cluster_name(
910
967
  replica_cluster_name)
911
968
  if handle is None:
969
+ if tail is not None:
970
+ for line in final_lines_to_print:
971
+ if not line.endswith('\n'):
972
+ line += '\n'
973
+ print(line, end='', flush=True)
912
974
  return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
913
975
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
914
976
 
@@ -917,15 +979,37 @@ def stream_replica_logs(service_name: str, replica_id: int,
917
979
  f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
918
980
 
919
981
  # Always tail the latest logs, which represent user setup & run.
920
- returncode = backend.tail_logs(handle, job_id=None, follow=follow)
921
- if returncode != 0:
922
- return (f'{colorama.Fore.RED}Failed to stream logs for replica '
923
- f'{replica_id}.{colorama.Style.RESET_ALL}')
982
+ if tail is None:
983
+ returncode = backend.tail_logs(handle, job_id=None, follow=follow)
984
+ if returncode != 0:
985
+ return (f'{colorama.Fore.RED}Failed to stream logs for replica '
986
+ f'{replica_id}.{colorama.Style.RESET_ALL}')
987
+ elif not follow and tail > 0:
988
+ final = backend.tail_logs(handle,
989
+ job_id=None,
990
+ follow=follow,
991
+ tail=tail,
992
+ stream_logs=False,
993
+ require_outputs=True,
994
+ process_stream=True)
995
+ if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
996
+ if tail is not None:
997
+ for line in final_lines_to_print:
998
+ if not line.endswith('\n'):
999
+ line += '\n'
1000
+ print(line, end='', flush=True)
1001
+ return (f'{colorama.Fore.RED}Failed to stream logs for replica '
1002
+ f'{replica_id}.{colorama.Style.RESET_ALL}')
1003
+ final_lines_to_print += final[1].splitlines()
1004
+ for line in final_lines_to_print[-tail:]:
1005
+ if not line.endswith('\n'):
1006
+ line += '\n'
1007
+ print(line, end='', flush=True)
924
1008
  return ''
925
1009
 
926
1010
 
927
1011
  def stream_serve_process_logs(service_name: str, stream_controller: bool,
928
- follow: bool) -> str:
1012
+ follow: bool, tail: Optional[int]) -> str:
929
1013
  msg = check_service_status_healthy(service_name)
930
1014
  if msg is not None:
931
1015
  return msg
@@ -940,14 +1024,24 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
940
1024
  return True
941
1025
  return record['status'] in serve_state.ServiceStatus.failed_statuses()
942
1026
 
943
- with open(os.path.expanduser(log_file), 'r', newline='',
944
- encoding='utf-8') as f:
945
- for line in log_utils.follow_logs(
946
- f,
947
- should_stop=_service_is_terminal,
948
- stop_on_eof=not follow,
949
- ):
1027
+ if tail is not None:
1028
+ lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
1029
+ tail)
1030
+ for line in lines:
1031
+ if not line.endswith('\n'):
1032
+ line += '\n'
950
1033
  print(line, end='', flush=True)
1034
+ else:
1035
+ with open(os.path.expanduser(log_file),
1036
+ 'r',
1037
+ newline='',
1038
+ encoding='utf-8') as f:
1039
+ for line in log_utils.follow_logs(
1040
+ f,
1041
+ should_stop=_service_is_terminal,
1042
+ stop_on_eof=not follow,
1043
+ ):
1044
+ print(line, end='', flush=True)
951
1045
  return ''
952
1046
 
953
1047
 
@@ -1140,20 +1234,22 @@ class ServeCodeGen:
1140
1234
 
1141
1235
  @classmethod
1142
1236
  def stream_replica_logs(cls, service_name: str, replica_id: int,
1143
- follow: bool) -> str:
1237
+ follow: bool, tail: Optional[int]) -> str:
1144
1238
  code = [
1145
1239
  'msg = serve_utils.stream_replica_logs('
1146
- f'{service_name!r}, {replica_id!r}, follow={follow})',
1240
+ f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
1147
1241
  'print(msg, flush=True)'
1148
1242
  ]
1149
1243
  return cls._build(code)
1150
1244
 
1151
1245
  @classmethod
1152
1246
  def stream_serve_process_logs(cls, service_name: str,
1153
- stream_controller: bool, follow: bool) -> str:
1247
+ stream_controller: bool, follow: bool,
1248
+ tail: Optional[int]) -> str:
1154
1249
  code = [
1155
1250
  f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
1156
- f'{stream_controller}, follow={follow})', 'print(msg, flush=True)'
1251
+ f'{stream_controller}, follow={follow}, tail={tail})',
1252
+ 'print(msg, flush=True)'
1157
1253
  ]
1158
1254
  return cls._build(code)
1159
1255
 
sky/serve/server/core.py CHANGED
@@ -740,6 +740,7 @@ def tail_logs(
740
740
  target: ServiceComponentOrStr,
741
741
  replica_id: Optional[int] = None,
742
742
  follow: bool = True,
743
+ tail: Optional[int] = None,
743
744
  ) -> None:
744
745
  """Tails logs for a service.
745
746
 
@@ -805,11 +806,14 @@ def tail_logs(
805
806
  service_name,
806
807
  stream_controller=(
807
808
  target == serve_utils.ServiceComponent.CONTROLLER),
808
- follow=follow)
809
+ follow=follow,
810
+ tail=tail)
809
811
  else:
810
812
  assert replica_id is not None, service_name
811
- code = serve_utils.ServeCodeGen.stream_replica_logs(
812
- service_name, replica_id, follow)
813
+ code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
814
+ replica_id,
815
+ follow,
816
+ tail=tail)
813
817
 
814
818
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
815
819
  # kill the process, so we need to handle it manually here.
@@ -834,6 +838,7 @@ def sync_down_logs(
834
838
  targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
835
839
  None] = None,
836
840
  replica_ids: Optional[List[int]] = None,
841
+ tail: Optional[int] = None,
837
842
  ) -> str:
838
843
  """Sync down logs from the controller for the given service.
839
844
 
@@ -936,16 +941,22 @@ def sync_down_logs(
936
941
  if component == serve_utils.ServiceComponent.CONTROLLER:
937
942
  stream_logs_code = (
938
943
  serve_utils.ServeCodeGen.stream_serve_process_logs(
939
- service_name, stream_controller=True, follow=False))
944
+ service_name,
945
+ stream_controller=True,
946
+ follow=False,
947
+ tail=tail))
940
948
  elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
941
949
  stream_logs_code = (
942
950
  serve_utils.ServeCodeGen.stream_serve_process_logs(
943
- service_name, stream_controller=False, follow=False))
951
+ service_name,
952
+ stream_controller=False,
953
+ follow=False,
954
+ tail=tail))
944
955
  elif component == serve_utils.ServiceComponent.REPLICA:
945
956
  replica_id = target.replica_id
946
957
  assert replica_id is not None, service_name
947
958
  stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
948
- service_name, replica_id, follow=False)
959
+ service_name, replica_id, follow=False, tail=tail)
949
960
  else:
950
961
  assert False, component
951
962
 
sky/server/common.py CHANGED
@@ -371,6 +371,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
371
371
 
372
372
 
373
373
  def handle_request_error(response: 'requests.Response') -> None:
374
+ # Keep the original HTTPError if the response code >= 400
375
+ response.raise_for_status()
376
+ # Other status codes are not expected neither, e.g. we do not expect to
377
+ # handle redirection here.
374
378
  if response.status_code != 200:
375
379
  with ux_utils.print_exception_no_traceback():
376
380
  raise RuntimeError(
@@ -557,6 +557,7 @@ class ServeLogsBody(RequestBody):
557
557
  target: Union[str, serve.ServiceComponent]
558
558
  replica_id: Optional[int] = None
559
559
  follow: bool = True
560
+ tail: Optional[int] = None
560
561
 
561
562
 
562
563
  class ServeDownloadLogsBody(RequestBody):
@@ -566,6 +567,7 @@ class ServeDownloadLogsBody(RequestBody):
566
567
  targets: Optional[Union[str, serve.ServiceComponent,
567
568
  List[Union[str, serve.ServiceComponent]]]]
568
569
  replica_ids: Optional[List[int]] = None
570
+ tail: Optional[int] = None
569
571
 
570
572
 
571
573
  class ServeStatusBody(RequestBody):
@@ -29,10 +29,10 @@ from sky.server.requests.serializers import decoders
29
29
  from sky.server.requests.serializers import encoders
30
30
  from sky.utils import common
31
31
  from sky.utils import common_utils
32
- from sky.utils import db_utils
33
32
  from sky.utils import env_options
34
33
  from sky.utils import subprocess_utils
35
34
  from sky.utils import ux_utils
35
+ from sky.utils.db import db_utils
36
36
 
37
37
  logger = sky_logging.init_logger(__name__)
38
38