skypilot-nightly 1.0.0.dev20250717__py3-none-any.whl → 1.0.0.dev20250720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +23 -13
- sky/backends/cloud_vm_ray_backend.py +19 -11
- sky/catalog/__init__.py +3 -1
- sky/catalog/aws_catalog.py +8 -5
- sky/catalog/azure_catalog.py +8 -5
- sky/catalog/common.py +8 -2
- sky/catalog/cudo_catalog.py +5 -2
- sky/catalog/do_catalog.py +4 -1
- sky/catalog/fluidstack_catalog.py +5 -2
- sky/catalog/gcp_catalog.py +8 -5
- sky/catalog/hyperbolic_catalog.py +5 -2
- sky/catalog/ibm_catalog.py +8 -5
- sky/catalog/lambda_catalog.py +8 -5
- sky/catalog/nebius_catalog.py +8 -5
- sky/catalog/oci_catalog.py +8 -5
- sky/catalog/paperspace_catalog.py +4 -1
- sky/catalog/runpod_catalog.py +5 -2
- sky/catalog/scp_catalog.py +8 -5
- sky/catalog/vast_catalog.py +5 -2
- sky/catalog/vsphere_catalog.py +4 -1
- sky/client/cli/command.py +25 -2
- sky/client/sdk.py +10 -5
- sky/clouds/aws.py +12 -7
- sky/clouds/azure.py +12 -7
- sky/clouds/cloud.py +9 -8
- sky/clouds/cudo.py +13 -7
- sky/clouds/do.py +12 -7
- sky/clouds/fluidstack.py +11 -6
- sky/clouds/gcp.py +12 -7
- sky/clouds/hyperbolic.py +11 -6
- sky/clouds/ibm.py +11 -6
- sky/clouds/kubernetes.py +7 -3
- sky/clouds/lambda_cloud.py +11 -6
- sky/clouds/nebius.py +12 -7
- sky/clouds/oci.py +12 -7
- sky/clouds/paperspace.py +12 -7
- sky/clouds/runpod.py +12 -7
- sky/clouds/scp.py +11 -6
- sky/clouds/vast.py +12 -7
- sky/clouds/vsphere.py +11 -6
- sky/core.py +6 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-a821dcaaae2a3823.js +6 -0
- sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.5233e938f14e31a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-63fc419cb82ad9b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9984.b56614f3c4c5961d.js → 9984.2b5e3fa69171bff9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa406155b4223d0d.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-14d404b7dd28502a.js → [job]-c5b357bfd9502fbe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +1 -0
- sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +3 -0
- sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +13 -143
- sky/jobs/client/sdk.py +1 -1
- sky/jobs/server/core.py +14 -0
- sky/jobs/state.py +9 -88
- sky/jobs/utils.py +28 -13
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/serve/client/sdk.py +7 -3
- sky/serve/controller.py +7 -3
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +171 -75
- sky/serve/server/core.py +17 -6
- sky/server/common.py +4 -0
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +1 -1
- sky/server/rest.py +71 -26
- sky/setup_files/MANIFEST.in +2 -0
- sky/setup_files/alembic.ini +152 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/configs.py +1 -1
- sky/skylet/job_lib.py +1 -1
- sky/skypilot_config.py +32 -6
- sky/users/permission.py +1 -1
- sky/utils/common_utils.py +77 -0
- sky/utils/db/__init__.py +0 -0
- sky/utils/{db_utils.py → db/db_utils.py} +59 -0
- sky/utils/db/migration_utils.py +53 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/RECORD +110 -101
- sky/dashboard/out/_next/static/chunks/1043-90a88c46f27b3df5.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-743abf4bc86baf48.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
- sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/webpack-c3b45b7b0eaef66f.js +0 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
- /sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/top_level.txt +0 -0
sky/schemas/db/script.py.mako
ADDED

@@ -0,0 +1,28 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    ${downgrades if downgrades else "pass"}
sky/schemas/db/skypilot_config/001_initial_schema.py
ADDED

@@ -0,0 +1,30 @@
+"""Initial schema for sky config database
+
+Revision ID: 001
+Revises:
+Create Date: 2024-01-01 12:00:00.000000
+
+"""
+# pylint: disable=invalid-name
+from alembic import op
+
+from sky.skypilot_config import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema for config_yaml table"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
sky/schemas/db/spot_jobs/001_initial_schema.py
ADDED

@@ -0,0 +1,97 @@
+"""Initial schema for spot jobs database with backwards compatibility columns
+
+Revision ID: 001
+Revises:
+Create Date: 2024-01-01 12:00:00.000000
+
+"""
+# pylint: disable=invalid-name
+import json
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.jobs.state import Base
+from sky.skylet import constants
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema and add all backwards compatibility columns"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+        # Add backwards compatibility columns using helper function that matches
+        # original add_column_to_table_sqlalchemy behavior exactly
+
+        # Spot table columns
+        db_utils.add_column_to_table_alembic('spot', 'failure_reason',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic('spot',
+                                             'spot_job_id',
+                                             sa.Integer(),
+                                             copy_from='job_id')
+        db_utils.add_column_to_table_alembic(
+            'spot',
+            'task_id',
+            sa.Integer(),
+            server_default='0',
+            value_to_replace_existing_entries=0)
+        db_utils.add_column_to_table_alembic('spot',
+                                             'task_name',
+                                             sa.Text(),
+                                             copy_from='job_name')
+        db_utils.add_column_to_table_alembic(
+            'spot',
+            'specs',
+            sa.Text(),
+            value_to_replace_existing_entries=json.dumps(
+                {'max_restarts_on_errors': 0}))
+        db_utils.add_column_to_table_alembic('spot', 'local_log_file',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic(
+            'spot',
+            'metadata',
+            sa.Text(),
+            server_default='{}',
+            value_to_replace_existing_entries='{}')
+
+        # Job info table columns
+        db_utils.add_column_to_table_alembic('job_info', 'schedule_state',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic('job_info', 'controller_pid',
+                                             sa.Integer())
+        db_utils.add_column_to_table_alembic('job_info', 'dag_yaml_path',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic('job_info', 'env_file_path',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic('job_info', 'user_hash', sa.Text())
+        db_utils.add_column_to_table_alembic(
+            'job_info',
+            'workspace',
+            sa.Text(),
+            value_to_replace_existing_entries=constants.
+            SKYPILOT_DEFAULT_WORKSPACE)
+        db_utils.add_column_to_table_alembic(
+            'job_info',
+            'priority',
+            sa.Integer(),
+            server_default=str(constants.DEFAULT_PRIORITY),
+            value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
+        db_utils.add_column_to_table_alembic('job_info', 'entrypoint',
+                                             sa.Text())
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'original_user_yaml_path',
+                                             sa.Text())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
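Note: the three files above are standard Alembic artifacts, a script.py.mako template for generating future revisions plus two '001' baseline revisions that create the current schema and backfill backwards-compatibility columns. As a rough illustration of the general Alembic flow these files participate in (not how SkyPilot actually wires them up; that logic lives in the new sky/utils/db/migration_utils.py, which is not shown in this diff), a revision chain like this is normally applied along the following lines. The script location and database URL are placeholders:

```python
# Minimal sketch of applying an Alembic revision chain programmatically.
# The script location and database URL are illustrative placeholders, not
# values taken from this package, and SkyPilot's own invocation (in
# sky/utils/db/migration_utils.py) may differ.
from alembic import command
from alembic.config import Config

cfg = Config()  # no alembic.ini required when options are set explicitly
cfg.set_main_option('script_location', '/path/to/alembic/scripts')
cfg.set_main_option('sqlalchemy.url', 'sqlite:////tmp/spot_jobs.db')

# Runs the upgrade() of every revision up to the latest head, which for the
# files above is the single '001' initial-schema revision.
command.upgrade(cfg, 'head')
```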
sky/serve/client/sdk.py
CHANGED
@@ -292,12 +292,13 @@ def status(

 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
-@rest.
+@rest.retry_transient_errors()
 def tail_logs(service_name: str,
               target: Union[str, 'serve_utils.ServiceComponent'],
               replica_id: Optional[int] = None,
               follow: bool = True,
-              output_stream: Optional['io.TextIOBase'] = None
+              output_stream: Optional['io.TextIOBase'] = None,
+              tail: Optional[int] = None) -> None:
     """Tails logs for a service.

     Usage:

@@ -367,6 +368,7 @@ def tail_logs(service_name: str,
         target=target,
         replica_id=replica_id,
         follow=follow,
+        tail=tail,
     )
     response = server_common.make_authenticated_request(
         'POST',

@@ -390,7 +392,8 @@ def sync_down_logs(service_name: str,
                        str, 'serve_utils.ServiceComponent',
                        List[Union[str,
                                   'serve_utils.ServiceComponent']]]] = None,
-                   replica_ids: Optional[List[int]] = None
+                   replica_ids: Optional[List[int]] = None,
+                   tail: Optional[int] = None) -> None:
     """Sync down logs from the service components to a local directory.

     This function syncs logs from the specified service components (controller,

@@ -429,6 +432,7 @@ def sync_down_logs(service_name: str,
         local_dir=local_dir,
         targets=targets,
         replica_ids=replica_ids,
+        tail=tail,
     )
     response = server_common.make_authenticated_request(
         'POST',
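Note: on the client side, tail_logs and sync_down_logs now accept an optional tail argument that is forwarded in the request payload. A hypothetical call against the signature shown above might look like the following; the service name, replica id, and the 'replica' target string are made-up example values, and a reachable SkyPilot API server is assumed:

```python
# Hypothetical usage of the new `tail` parameter on the serve client SDK.
# 'my-service', replica_id=1, and target='replica' are example values only;
# the call assumes a running SkyPilot API server.
from sky.serve.client import sdk as serve_sdk

# Print only the last 100 log lines of replica 1, then return (follow=False).
serve_sdk.tail_logs('my-service',
                    target='replica',
                    replica_id=1,
                    follow=False,
                    tail=100)
```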
sky/serve/controller.py
CHANGED
@@ -156,9 +156,13 @@ class SkyServeController:
             return responses.JSONResponse(content={'message': 'Success'},
                                           status_code=200)
         except Exception as e: # pylint: disable=broad-except
-
-
-            return responses.JSONResponse(content={
+            exception_str = common_utils.format_exception(e)
+            logger.error(f'Error in update_service: {exception_str}')
+            return responses.JSONResponse(content={
+                'message': 'Error',
+                'exception': exception_str,
+                'traceback': traceback.format_exc()
+            },
                                           status_code=500)

         @self._app.post('/controller/terminate_replica')
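Note: the controller change above replaces a bare error response with a structured JSON body carrying the formatted exception and traceback alongside a 500 status. A self-contained sketch of that same pattern in a generic FastAPI app follows; this is not the SkyServe controller code, and the route path and simulated failure are placeholders:

```python
# Standalone sketch of the error-reporting pattern above: catch any handler
# exception and return a JSON body with the exception text and traceback,
# plus a 500 status code. Generic FastAPI example, not SkyServe code.
import traceback

from fastapi import FastAPI
from fastapi import responses

app = FastAPI()


@app.post('/controller/update_service')
async def update_service() -> responses.JSONResponse:
    try:
        raise RuntimeError('simulated failure')  # placeholder for real work
    except Exception as e:  # pylint: disable=broad-except
        return responses.JSONResponse(content={
            'message': 'Error',
            'exception': f'{type(e).__name__}: {e}',
            'traceback': traceback.format_exc(),
        },
                                      status_code=500)
```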
sky/serve/serve_state.py
CHANGED
sky/serve/serve_utils.py
CHANGED
@@ -12,8 +12,8 @@ import shutil
 import threading
 import time
 import typing
-from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator,
-                    Optional, TextIO, Type, TypeVar, Union)
+from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
+                    List, Optional, TextIO, Type, TypeVar, Union)
 import uuid

 import colorama

@@ -782,6 +782,54 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None


+def _process_line(line: str,
+                  cluster_name: str,
+                  stop_on_eof: bool = False) -> Iterator[str]:
+    # The line might be directing users to view logs, like
+    # `✓ Cluster launched: new-http. View logs at: *.log`
+    # We should tail the detailed logs for user.
+    def cluster_is_up() -> bool:
+        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
+        if cluster_record is None:
+            return False
+        return cluster_record['status'] == status_lib.ClusterStatus.UP
+
+    provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
+    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+    if provision_log_prompt is not None:
+        nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
+
+        try:
+            with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
+                # We still exit if more than 10 seconds without new content
+                # to avoid any internal bug that causes the launch to fail
+                # while cluster status remains INIT.
+                yield from log_utils.follow_logs(f,
+                                                 should_stop=cluster_is_up,
+                                                 stop_on_eof=stop_on_eof,
+                                                 idle_timeout_seconds=10)
+        except FileNotFoundError:
+            yield line
+
+            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                   f'Try to expand log file {nested_log_path} but not '
+                   f'found. Skipping...{colorama.Style.RESET_ALL}')
+            pass
+        return
+
+    if log_prompt is not None:
+        # Now we skip other logs (file sync logs) since we lack
+        # utility to determine when these log files are finished
+        # writing.
+        # TODO(tian): We should not skip these logs since there are
+        # small chance that error will happen in file sync. Need to
+        # find a better way to do this.
+        return
+
+    yield line
+
+
 def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,

@@ -804,51 +852,8 @@ def _follow_logs_with_provision_expanding(
         Log lines, including expanded content from referenced provision logs.
     """

-    def cluster_is_up() -> bool:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        if cluster_record is None:
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
-
     def process_line(line: str) -> Iterator[str]:
-
-        # `✓ Cluster launched: new-http. View logs at: *.log`
-        # We should tail the detailed logs for user.
-        provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
-        log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
-
-        if provision_log_prompt is not None:
-            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-            try:
-                with open(nested_log_path, 'r', newline='',
-                          encoding='utf-8') as f:
-                    # We still exit if more than 10 seconds without new content
-                    # to avoid any internal bug that causes the launch to fail
-                    # while cluster status remains INIT.
-                    yield from log_utils.follow_logs(f,
-                                                     should_stop=cluster_is_up,
-                                                     stop_on_eof=stop_on_eof,
-                                                     idle_timeout_seconds=10)
-            except FileNotFoundError:
-                yield line
-
-                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                       f'Try to expand log file {nested_log_path} but not '
-                       f'found. Skipping...{colorama.Style.RESET_ALL}')
-                pass
-            return
-
-        if log_prompt is not None:
-            # Now we skip other logs (file sync logs) since we lack
-            # utility to determine when these log files are finished
-            # writing.
-            # TODO(tian): We should not skip these logs since there are
-            # small chance that error will happen in file sync. Need to
-            # find a better way to do this.
-            return
-
-        yield line
+        yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)

     return log_utils.follow_logs(file,
                                  should_stop=should_stop,

@@ -857,18 +862,51 @@ def _follow_logs_with_provision_expanding(
                                  idle_timeout_seconds=idle_timeout_seconds)


-def 
-
+def _capped_follow_logs_with_provision_expanding(
+    log_list: List[str],
+    cluster_name: str,
+    *,
+    line_cap: int = 100,
+) -> Iterator[str]:
+    """Follows logs and expands any provision.log references found.
+
+    Args:
+        log_list: List of Log Lines to read from.
+        cluster_name: Name of the cluster being launched.
+        line_cap: Number of last lines to return
+
+    Yields:
+        Log lines, including expanded content from referenced provision logs.
+    """
+    all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+
+    for line in log_list:
+        for processed in _process_line(line=line,
+                                       cluster_name=cluster_name,
+                                       stop_on_eof=False):
+            all_lines.append(processed)
+
+    yield from all_lines
+
+
+def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
+                        tail: Optional[int]) -> str:
     msg = check_service_status_healthy(service_name)
     if msg is not None:
         return msg
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
           f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
-
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
-
-
+        if tail is not None:
+            lines = common_utils.read_last_n_lines(log_file_name, tail)
+            for line in lines:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
+        else:
+            with open(log_file_name, 'r', encoding='utf-8') as f:
+                print(f.read(), flush=True)
         return ''

     launch_log_file_name = generate_replica_launch_log_file_name(

@@ -891,24 +929,48 @@ def stream_replica_logs(service_name: str, replica_id: int,

     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
-
-
-
-
-
-
-
-
+
+    # Handle launch logs based on number parameter
+    final_lines_to_print = []
+    if tail is not None:
+        static_lines = common_utils.read_last_n_lines(launch_log_file_name,
+                                                      tail)
+        lines = list(
+            _capped_follow_logs_with_provision_expanding(
+                log_list=static_lines,
+                cluster_name=replica_cluster_name,
+                line_cap=tail,
+            ))
+        final_lines_to_print += lines
+    else:
+        with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
+            for line in _follow_logs_with_provision_expanding(
+                    f,
+                    replica_cluster_name,
+                    should_stop=replica_provisioned,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)

     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return ''

     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
     if handle is None:
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle

@@ -917,15 +979,37 @@ def stream_replica_logs(service_name: str, replica_id: int,
           f'of replica {replica_id}...{colorama.Style.RESET_ALL}')

     # Always tail the latest logs, which represent user setup & run.
-
-
-
-
+    if tail is None:
+        returncode = backend.tail_logs(handle, job_id=None, follow=follow)
+        if returncode != 0:
+            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+    elif not follow and tail > 0:
+        final = backend.tail_logs(handle,
+                                  job_id=None,
+                                  follow=follow,
+                                  tail=tail,
+                                  stream_logs=False,
+                                  require_outputs=True,
+                                  process_stream=True)
+        if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
+            if tail is not None:
+                for line in final_lines_to_print:
+                    if not line.endswith('\n'):
+                        line += '\n'
+                    print(line, end='', flush=True)
+            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+        final_lines_to_print += final[1].splitlines()
+        for line in final_lines_to_print[-tail:]:
+            if not line.endswith('\n'):
+                line += '\n'
+            print(line, end='', flush=True)
     return ''


 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool) -> str:
+                              follow: bool, tail: Optional[int]) -> str:
     msg = check_service_status_healthy(service_name)
     if msg is not None:
         return msg

@@ -940,14 +1024,24 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()

-
-
-
-
-
-
-        ):
+    if tail is not None:
+        lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
+                                               tail)
+        for line in lines:
+            if not line.endswith('\n'):
+                line += '\n'
             print(line, end='', flush=True)
+    else:
+        with open(os.path.expanduser(log_file),
+                  'r',
+                  newline='',
+                  encoding='utf-8') as f:
+            for line in log_utils.follow_logs(
+                    f,
+                    should_stop=_service_is_terminal,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
     return ''


@@ -1140,20 +1234,22 @@ class ServeCodeGen:

     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool) -> str:
+                            follow: bool, tail: Optional[int]) -> str:
         code = [
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow})',
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
            'print(msg, flush=True)'
         ]
         return cls._build(code)

     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
-                                  stream_controller: bool, follow: bool
+                                  stream_controller: bool, follow: bool,
+                                  tail: Optional[int]) -> str:
         code = [
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}
+            f'{stream_controller}, follow={follow}, tail={tail})',
+            'print(msg, flush=True)'
         ]
         return cls._build(code)

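Note: the new _capped_follow_logs_with_provision_expanding keeps only the last line_cap processed lines by pushing every line through a bounded collections.deque. A minimal standalone sketch of just that capping idea (the provision-log expansion done by _process_line is omitted, and last_n_lines is a hypothetical helper name):

```python
# Minimal sketch of the line-capping idea: a deque with maxlen keeps only the
# last `line_cap` lines and silently evicts older ones.
import collections
from typing import Deque, Iterator, List


def last_n_lines(lines: List[str], line_cap: int = 100) -> Iterator[str]:
    capped: Deque[str] = collections.deque(maxlen=line_cap)
    for line in lines:
        capped.append(line)  # once full, appending drops the oldest entry
    yield from capped


print(list(last_n_lines([f'line {i}' for i in range(10)], line_cap=3)))
# -> ['line 7', 'line 8', 'line 9']
```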
sky/serve/server/core.py
CHANGED
@@ -740,6 +740,7 @@ def tail_logs(
     target: ServiceComponentOrStr,
     replica_id: Optional[int] = None,
     follow: bool = True,
+    tail: Optional[int] = None,
 ) -> None:
     """Tails logs for a service.

@@ -805,11 +806,14 @@ def tail_logs(
             service_name,
             stream_controller=(
                 target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow
+            follow=follow,
+            tail=tail)
     else:
         assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(
-
+        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
+                                                            replica_id,
+                                                            follow,
+                                                            tail=tail)

     # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
     # kill the process, so we need to handle it manually here.

@@ -834,6 +838,7 @@ def sync_down_logs(
     targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
                    None] = None,
     replica_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
 ) -> str:
     """Sync down logs from the controller for the given service.

@@ -936,16 +941,22 @@ def sync_down_logs(
         if component == serve_utils.ServiceComponent.CONTROLLER:
             stream_logs_code = (
                 serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
+                    service_name,
+                    stream_controller=True,
+                    follow=False,
+                    tail=tail))
         elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
             stream_logs_code = (
                 serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
+                    service_name,
+                    stream_controller=False,
+                    follow=False,
+                    tail=tail))
         elif component == serve_utils.ServiceComponent.REPLICA:
             replica_id = target.replica_id
             assert replica_id is not None, service_name
             stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False)
+                service_name, replica_id, follow=False, tail=tail)
         else:
             assert False, component

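Note: sky/serve/server/core.py now threads tail into ServeCodeGen.stream_replica_logs and stream_serve_process_logs, which format a small Python snippet to run on the controller. Reproducing just that formatting step (the f-string shown in the serve_utils hunk above) illustrates the generated call; the argument values here are made up:

```python
# Illustration of the code string ServeCodeGen.stream_replica_logs builds,
# using the f-string from the serve_utils diff above. Argument values are
# made-up examples.
service_name, replica_id, follow, tail = 'my-service', 1, False, 100
code = ('msg = serve_utils.stream_replica_logs('
        f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})')
print(code)
# -> msg = serve_utils.stream_replica_logs('my-service', 1, follow=False, tail=100)
```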
sky/server/common.py
CHANGED
@@ -371,6 +371,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:


 def handle_request_error(response: 'requests.Response') -> None:
+    # Keep the original HTTPError if the response code >= 400
+    response.raise_for_status()
+    # Other status codes are not expected neither, e.g. we do not expect to
+    # handle redirection here.
     if response.status_code != 200:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
CHANGED
|
@@ -557,6 +557,7 @@ class ServeLogsBody(RequestBody):
|
|
|
557
557
|
target: Union[str, serve.ServiceComponent]
|
|
558
558
|
replica_id: Optional[int] = None
|
|
559
559
|
follow: bool = True
|
|
560
|
+
tail: Optional[int] = None
|
|
560
561
|
|
|
561
562
|
|
|
562
563
|
class ServeDownloadLogsBody(RequestBody):
|
|
@@ -566,6 +567,7 @@ class ServeDownloadLogsBody(RequestBody):
|
|
|
566
567
|
targets: Optional[Union[str, serve.ServiceComponent,
|
|
567
568
|
List[Union[str, serve.ServiceComponent]]]]
|
|
568
569
|
replica_ids: Optional[List[int]] = None
|
|
570
|
+
tail: Optional[int] = None
|
|
569
571
|
|
|
570
572
|
|
|
571
573
|
class ServeStatusBody(RequestBody):
|
sky/server/requests/requests.py
CHANGED
@@ -29,10 +29,10 @@ from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
 from sky.utils import common
 from sky.utils import common_utils
-from sky.utils import db_utils
 from sky.utils import env_options
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.utils.db import db_utils

 logger = sky_logging.init_logger(__name__)
