skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +33 -4
- sky/catalog/kubernetes_catalog.py +8 -0
- sky/catalog/nebius_catalog.py +0 -1
- sky/check.py +11 -1
- sky/client/cli/command.py +234 -100
- sky/client/sdk.py +30 -9
- sky/client/sdk_async.py +815 -0
- sky/clouds/kubernetes.py +6 -1
- sky/clouds/nebius.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +14 -2
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +43 -2
- sky/jobs/client/sdk_async.py +135 -0
- sky/jobs/server/core.py +48 -1
- sky/jobs/server/server.py +52 -3
- sky/jobs/state.py +5 -1
- sky/jobs/utils.py +3 -1
- sky/provision/kubernetes/utils.py +30 -4
- sky/provision/nebius/instance.py +1 -0
- sky/provision/nebius/utils.py +9 -1
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/serve/client/impl.py +85 -1
- sky/serve/client/sdk.py +16 -47
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +3 -1
- sky/serve/controller.py +6 -3
- sky/serve/load_balancer.py +3 -1
- sky/serve/serve_state.py +93 -5
- sky/serve/serve_utils.py +200 -67
- sky/serve/server/core.py +13 -197
- sky/serve/server/impl.py +261 -23
- sky/serve/service.py +15 -3
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +46 -0
- sky/server/auth/oauth2_proxy.py +185 -0
- sky/server/common.py +119 -21
- sky/server/constants.py +1 -1
- sky/server/daemons.py +60 -11
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +19 -0
- sky/server/rest.py +114 -0
- sky/server/server.py +44 -40
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +5 -1
- sky/skylet/skylet.py +3 -1
- sky/task.py +61 -21
- sky/templates/kubernetes-ray.yml.j2 +9 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/sky-serve-controller.yaml.j2 +1 -0
- sky/usage/usage_lib.py +8 -6
- sky/utils/annotations.py +8 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +7 -0
- sky/utils/db/migration_utils.py +2 -2
- sky/utils/rich_utils.py +120 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
- sky/client/sdk.pyi +0 -300
- sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
- /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py
CHANGED

@@ -2032,9 +2032,7 @@ class KubernetesInstanceType:
             accelerator_type = match.group('accelerator_type')
             if accelerator_count:
                 accelerator_count = int(accelerator_count)
-
-                # the original format.
-                accelerator_type = str(accelerator_type).replace('_', ' ')
+                accelerator_type = str(accelerator_type)
             else:
                 accelerator_count = None
                 accelerator_type = None
@@ -2047,7 +2045,7 @@ class KubernetesInstanceType:
             accelerator_type = prev_match.group('accelerator_type')
             if accelerator_count:
                 accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type)
+                accelerator_type = str(accelerator_type)
             else:
                 accelerator_count = None
                 accelerator_type = None
@@ -2998,6 +2996,13 @@ def get_kubernetes_node_info(
                 # Get all the pods running on the node
                 if (pod.spec.node_name == node.metadata.name and
                         pod.status.phase in ['Running', 'Pending']):
+                    # Skip pods that should not count against GPU count
+                    if should_exclude_pod_from_gpu_allocation(pod):
+                        logger.debug(
+                            f'Excluding low priority pod '
+                            f'{pod.metadata.name} from GPU allocation '
+                            f'calculations on node {node.metadata.name}')
+                        continue
                     # Iterate over all the containers in the pod and sum the
                     # GPU requests
                     for container in pod.spec.containers:
@@ -3596,3 +3601,24 @@ def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
                 time.sleep(retry_delay)
             else:
                 raise
+
+
+def should_exclude_pod_from_gpu_allocation(pod) -> bool:
+    """Check if a pod should be excluded from GPU count calculations.
+
+    Some cloud providers run low priority test/verification pods that request
+    GPUs but should not count against real GPU availability since they are
+    designed to be evicted when higher priority workloads need resources.
+
+    Args:
+        pod: Kubernetes pod object
+
+    Returns:
+        bool: True if the pod should be excluded from GPU count calculations.
+    """
+    # CoreWeave HPC verification pods - identified by namespace
+    if (hasattr(pod.metadata, 'namespace') and
+            pod.metadata.namespace == 'cw-hpc-verification'):
+        return True
+
+    return False
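The new `should_exclude_pod_from_gpu_allocation` helper gates purely on the pod's namespace. Below is a minimal standalone sketch of the same check, using `types.SimpleNamespace` stand-ins rather than real Kubernetes pod objects; the namespace set is illustrative and currently matches only CoreWeave's verification namespace:

```python
from types import SimpleNamespace

# Namespaces whose pods are treated as evictable and ignored for GPU counting.
EXCLUDED_NAMESPACES = {'cw-hpc-verification'}


def should_exclude_pod_from_gpu_allocation(pod) -> bool:
    """Return True for low-priority pods whose GPU requests should be ignored."""
    return getattr(pod.metadata, 'namespace', None) in EXCLUDED_NAMESPACES


verification_pod = SimpleNamespace(metadata=SimpleNamespace(
    name='gpu-burn-abc', namespace='cw-hpc-verification'))
user_pod = SimpleNamespace(metadata=SimpleNamespace(
    name='train-job', namespace='default'))

assert should_exclude_pod_from_gpu_allocation(verification_pod)
assert not should_exclude_pod_from_gpu_allocation(user_pod)
```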
sky/provision/nebius/instance.py
CHANGED

@@ -134,6 +134,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
         image_family=config.node_config['ImageId'],
         disk_size=config.node_config['DiskSize'],
         user_data=config.node_config['UserData'],
+        use_spot=config.node_config['use_spot'],
         associate_public_ip_address=(
             not config.provider_config['use_internal_ips']),
         filesystems=config.node_config.get('filesystems', []),
sky/provision/nebius/utils.py
CHANGED

@@ -168,6 +168,7 @@ def launch(cluster_name_on_cloud: str,
            user_data: str,
            associate_public_ip_address: bool,
            filesystems: List[Dict[str, Any]],
+           use_spot: bool = False,
            network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
     # Each node must have a unique name to avoid conflicts between
     # multiple worker VMs. To ensure uniqueness,a UUID is appended
@@ -281,7 +282,14 @@ def launch(cluster_name_on_cloud: str,
                 public_ip_address=nebius.compute().PublicIPAddress()
                 if associate_public_ip_address else None,
             )
-        ]
+        ],
+        recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+        if use_spot else None,
+        preemptible=nebius.compute().PreemptibleSpec(
+            priority=1,
+            on_preemption=nebius.compute(
+            ).PreemptibleSpec.PreemptionPolicy.STOP) if use_spot else None,
+    ))).wait()
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
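The spot path fills two otherwise-`None` fields of the Nebius instance spec. The same conditional-spec pattern in isolation, with simplified dataclasses standing in for the real Nebius SDK types:

```python
# Simplified stand-ins for the Nebius SDK spec types; illustrative only.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PreemptibleSpec:
    priority: int
    on_preemption: str  # e.g. 'STOP': stop (rather than delete) on preemption


@dataclass
class InstanceSpec:
    name: str
    recovery_policy: Optional[str] = None  # 'FAIL': do not auto-recover spot VMs
    preemptible: Optional[PreemptibleSpec] = None


def build_spec(name: str, use_spot: bool = False) -> InstanceSpec:
    # On-demand instances leave both fields unset; spot instances opt in.
    return InstanceSpec(
        name=name,
        recovery_policy='FAIL' if use_spot else None,
        preemptible=PreemptibleSpec(priority=1, on_preemption='STOP')
        if use_spot else None)


print(build_spec('worker-0', use_spot=True))
```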
sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py
ADDED

@@ -0,0 +1,35 @@
+"""add workspace column to cluster_history table
+
+Revision ID: 002
+Revises: 001
+Create Date: 2025-08-06
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '002'
+down_revision: Union[str, Sequence[str], None] = '001'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'workspace',
+                                             sa.Text(),
+                                             server_default=None)
+    pass
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    pass
sky/schemas/db/spot_jobs/003_pool_hash.py
ADDED

@@ -0,0 +1,34 @@
+"""Adding a hash column for pool.
+
+Revision ID: 003
+Revises: 002
+Create Date: 2025-07-18
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '003'
+down_revision: Union[str, Sequence[str], None] = '002'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for pool hash."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'pool_hash',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for pool hash."""
+    pass
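Both migrations follow the same additive pattern: `upgrade()` adds a nullable text column inside an autocommit block, and `downgrade()` is a no-op, so the change stays backward compatible. `db_utils.add_column_to_table_alembic` is SkyPilot's internal helper; assuming it is essentially an idempotent `op.add_column`, a plain-Alembic equivalent for the `cluster_history`/`workspace` case would look like:

```python
# Plain-Alembic sketch of what the helper presumably does (assumption: an
# idempotent add_column). Runs only inside an Alembic migration context.
from alembic import op
import sqlalchemy as sa


def upgrade() -> None:
    """Add a nullable text column, skipping it if it already exists."""
    bind = op.get_bind()
    existing = {
        col['name'] for col in sa.inspect(bind).get_columns('cluster_history')
    }
    if 'workspace' not in existing:
        op.add_column(
            'cluster_history',
            sa.Column('workspace', sa.Text(), server_default=None))


def downgrade() -> None:
    """No-op: older SkyPilot versions simply ignore the extra column."""
```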
sky/serve/client/impl.py
CHANGED

@@ -1,7 +1,7 @@
 """Implementation of SDK for SkyServe."""
 import json
 import typing
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 
 import click
 
@@ -12,6 +12,8 @@ from sky.utils import admin_policy_utils
 from sky.utils import dag_utils
 
 if typing.TYPE_CHECKING:
+    import io
+
     import sky
     from sky.serve import serve_utils
 
@@ -186,3 +188,85 @@ def status(
         json=json.loads(body.model_dump_json()),
         timeout=(5, None))
     return server_common.get_request_id(response)
+
+
+def tail_logs(service_name: str,
+              target: Union[str, 'serve_utils.ServiceComponent'],
+              replica_id: Optional[int] = None,
+              follow: bool = True,
+              output_stream: Optional['io.TextIOBase'] = None,
+              tail: Optional[int] = None,
+              pool: bool = False) -> None:
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    if pool:
+        body = payloads.JobsPoolLogsBody(
+            pool_name=service_name,
+            target=target,
+            worker_id=replica_id,
+            follow=follow,
+            tail=tail,
+        )
+    else:
+        body = payloads.ServeLogsBody(
+            service_name=service_name,
+            target=target,
+            replica_id=replica_id,
+            follow=follow,
+            tail=tail,
+        )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_logs' if pool else '/serve/logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None),
+        stream=True)
+    request_id = server_common.get_request_id(response)
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=True)
+
+
+def sync_down_logs(service_name: str,
+                   local_dir: str,
+                   *,
+                   targets: Optional[Union[
+                       str, 'serve_utils.ServiceComponent',
+                       Sequence[Union[str,
+                                      'serve_utils.ServiceComponent']]]] = None,
+                   replica_ids: Optional[List[int]] = None,
+                   tail: Optional[int] = None,
+                   pool: bool = False) -> None:
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    if pool:
+        body = payloads.JobsPoolDownloadLogsBody(
+            pool_name=service_name,
+            local_dir=local_dir,
+            targets=targets,
+            worker_ids=replica_ids,
+            tail=tail,
+        )
+    else:
+        body = payloads.ServeDownloadLogsBody(
+            service_name=service_name,
+            # No need to set here, since the server will override it
+            # to a directory on the API server.
+            local_dir=local_dir,
+            targets=targets,
+            replica_ids=replica_ids,
+            tail=tail,
+        )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_sync-down-logs' if pool else '/serve/sync-down-logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
+
+    # Download from API server paths to the client's local_dir
+    client_common.download_logs_from_api_server([remote_dir], remote_dir,
+                                                local_dir)
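The refactor funnels both the service endpoints and the new `/jobs` pool endpoints through one helper, selected by the `pool` flag: it swaps the payload type (`ServeLogsBody` vs. `JobsPoolLogsBody`) and the endpoint path while sharing all streaming logic. A hypothetical client-side call, assuming a reachable SkyPilot API server and an existing service named `my-service`:

```python
# Hypothetical usage of the shared helper; service and replica ids are
# illustrative and must exist on the API server.
import sys

from sky.serve.client import impl

# Streams the last 100 log lines of replica 1 via POST /serve/logs.
impl.tail_logs('my-service',
               target='replica',
               replica_id=1,
               follow=False,
               output_stream=sys.stdout,
               tail=100)

# The same helper with pool=True posts to /jobs/pool_logs instead, treating
# replica_id as a pool worker id.
```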
sky/serve/client/sdk.py
CHANGED

@@ -1,9 +1,8 @@
 """SDK for SkyServe."""
 import json
 import typing
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 
-from sky.client import common as client_common
 from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
@@ -290,27 +289,13 @@ def tail_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-
-
-
-
-
-
-
-        follow=follow,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None),
-        stream=True)
-    request_id = server_common.get_request_id(response)
-    return sdk.stream_response(request_id=request_id,
-                               response=response,
-                               output_stream=output_stream,
-                               resumable=True)
+    return impl.tail_logs(service_name,
+                          target,
+                          replica_id,
+                          follow,
+                          output_stream,
+                          tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint
@@ -320,8 +305,8 @@ def sync_down_logs(service_name: str,
                    *,
                    targets: Optional[Union[
                        str, 'serve_utils.ServiceComponent',
-
-
+                       Sequence[Union[str,
+                                      'serve_utils.ServiceComponent']]]] = None,
                    replica_ids: Optional[List[int]] = None,
                    tail: Optional[int] = None) -> None:
     """Sync down logs from the service components to a local directory.
@@ -352,25 +337,9 @@ def sync_down_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-
-
-
-
-
-
-        # to a directory on the API server.
-        local_dir=local_dir,
-        targets=targets,
-        replica_ids=replica_ids,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/sync-down-logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
-
-    # Download from API server paths to the client's local_dir
-    client_common.download_logs_from_api_server([remote_dir], remote_dir,
-                                                local_dir)
+    return impl.sync_down_logs(service_name,
+                               local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)

sky/serve/client/sdk_async.py
ADDED

@@ -0,0 +1,130 @@
+"""Async SDK for SkyServe."""
+import typing
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from sky.client import sdk_async
+from sky.serve.client import sdk
+from sky.usage import usage_lib
+from sky.utils import context_utils
+
+if typing.TYPE_CHECKING:
+    import io
+
+    import sky
+    from sky.serve import serve_utils
+
+
+@usage_lib.entrypoint
+async def up(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> Tuple[str, str]:
+    """Async version of up() that spins up a service."""
+    request_id = await context_utils.to_thread(sdk.up, task, service_name,
+                                               _need_confirmation)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def update(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    mode: 'serve_utils.UpdateMode',
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of update() that updates an existing service."""
+    request_id = await context_utils.to_thread(sdk.update, task, service_name,
+                                               mode, _need_confirmation)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def down(
+    service_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of down() that tears down a service."""
+    request_id = await context_utils.to_thread(sdk.down, service_names, all,
+                                               purge)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def terminate_replica(
+    service_name: str,
+    replica_id: int,
+    purge: bool,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of terminate_replica() that tears down a specific
+    replica."""
+    request_id = await context_utils.to_thread(sdk.terminate_replica,
+                                               service_name, replica_id, purge)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def status(
+    service_names: Optional[Union[str, List[str]]],
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> List[Dict[str, Any]]:
+    """Async version of status() that sdk_async.gets service statuses."""
+    request_id = await context_utils.to_thread(sdk.status, service_names)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def tail_logs(service_name: str,
+                    target: Union[str, 'serve_utils.ServiceComponent'],
+                    replica_id: Optional[int] = None,
+                    follow: bool = True,
+                    output_stream: Optional['io.TextIOBase'] = None) -> None:
+    """Async version of tail_logs() that tails logs for a service."""
+    return await context_utils.to_thread(sdk.tail_logs, service_name, target,
+                                         replica_id, follow, output_stream)
+
+
+@usage_lib.entrypoint
+async def sync_down_logs(service_name: str,
+                         local_dir: str,
+                         *,
+                         targets: Optional[Union[
+                             str, 'serve_utils.ServiceComponent', List[Union[
+                                 str, 'serve_utils.ServiceComponent']]]] = None,
+                         replica_ids: Optional[List[int]] = None) -> None:
+    """Async version of sync_down_logs() that syncs down logs from service
+    components."""
+    return await context_utils.to_thread(sdk.sync_down_logs,
+                                         service_name,
+                                         local_dir,
+                                         targets=targets,
+                                         replica_ids=replica_ids)
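The async wrappers run each blocking SDK call in a thread via `context_utils.to_thread` and then await the request's completion, streaming logs when a `StreamConfig` is supplied. A hypothetical end-to-end use, assuming a reachable SkyPilot API server and a service YAML (illustrative path) with a valid `service:` section:

```python
# Hypothetical usage sketch; 'service.yaml' and 'my-service' are placeholders.
import asyncio

import sky
from sky.serve.client import sdk_async as serve_async


async def main() -> None:
    task = sky.Task.from_yaml('service.yaml')
    # With the default StreamConfig, provisioning logs stream while awaiting;
    # up() is assumed to resolve to (service_name, endpoint).
    service_name, endpoint = await serve_async.up(task, 'my-service')
    print(f'{service_name} is serving at {endpoint}')
    print(await serve_async.status(['my-service']))
    await serve_async.down('my-service')


asyncio.run(main())
```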
sky/serve/constants.py
CHANGED

@@ -105,7 +105,9 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
 # v1.0 - Introduce rolling update.
 # v2.0 - Added template-replica feature.
 # v3.0 - Added cluster pool.
-SERVE_VERSION = 3
+# v4.0 - Added pool argument to wait_service_registration.
+# v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
+SERVE_VERSION = 5
 
 TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
     'The version of service is outdated and does not support manually '
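The protocol bump means services launched before v5.0 cannot serve the new pool-aware log RPCs. A sketch of the version-gating pattern such constants support; the helper name and message are assumptions, since the real checks live elsewhere in SkyPilot's serve code paths:

```python
# Assumed shape of a version gate against SERVE_VERSION; illustrative only.
SERVE_VERSION = 5


def check_pool_logs_supported(service_version: int) -> None:
    # Pool arguments to the log-streaming RPCs only exist from v5.0 on.
    required = 5
    if service_version < required:
        raise RuntimeError(
            f'Service was launched with serve version {service_version}, but '
            f'pool log streaming requires version >= {required}. '
            'Re-launch the service with a newer SkyPilot to upgrade.')


check_pool_logs_supported(SERVE_VERSION)  # no-op for up-to-date services
```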
sky/serve/controller.py
CHANGED

@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
 """
 import contextlib
 import logging
+import os
 import threading
 import time
 import traceback
@@ -26,11 +27,12 @@ from sky.utils import ux_utils
 logger = sky_logging.init_logger(__name__)
 
 
-class
+class AutoscalerInfoFilter(logging.Filter):
 
     def filter(self, record: logging.LogRecord) -> bool:
         message = record.getMessage()
-        return not ('GET' in message and '200' in message)
+        return not ('GET' in message and '200' in message and
+                    '/autoscaler/info' in message)
 
 
 class SkyServeController:
@@ -60,6 +62,7 @@ class SkyServeController:
         uvicorn_access_logger = logging.getLogger('uvicorn.access')
         for handler in uvicorn_access_logger.handlers:
             handler.setFormatter(sky_logging.FORMATTER)
+            handler.addFilter(AutoscalerInfoFilter())
         yield
 
     def _run_autoscaler(self):
@@ -242,7 +245,7 @@ class SkyServeController:
         threading.Thread(target=self._run_autoscaler).start()
 
         logger.info('SkyServe Controller started on '
-                    f'http://{self._host}:{self._port}')
+                    f'http://{self._host}:{self._port}. PID: {os.getpid()}')
 
         uvicorn.run(self._app, host=self._host, port=self._port)
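The suppression relies on the standard `logging.Filter` hook: a filter attached to a handler sees every record, and returning `False` from `filter()` drops it. A self-contained demonstration of the same technique:

```python
import logging


class AutoscalerInfoFilter(logging.Filter):
    """Drops successful GET /autoscaler/info access-log records."""

    def filter(self, record: logging.LogRecord) -> bool:
        message = record.getMessage()
        # Returning False drops the record; True lets it through.
        return not ('GET' in message and '200' in message and
                    '/autoscaler/info' in message)


handler = logging.StreamHandler()
handler.addFilter(AutoscalerInfoFilter())

demo_logger = logging.getLogger('uvicorn.access.demo')
demo_logger.addHandler(handler)
demo_logger.setLevel(logging.INFO)

demo_logger.info('"GET /autoscaler/info HTTP/1.1" 200 OK')  # suppressed
demo_logger.info('"GET /v1/status HTTP/1.1" 200 OK')  # printed
```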
sky/serve/load_balancer.py
CHANGED

@@ -1,6 +1,7 @@
 """LoadBalancer: Distribute any incoming request to all ready replicas."""
 import asyncio
 import logging
+import os
 import threading
 import traceback
 from typing import Dict, List, Optional, Union
@@ -254,7 +255,8 @@ class SkyServeLoadBalancer:
         protocol = 'https' if self._tls_credential is not None else 'http'
 
         logger.info('SkyServe Load Balancer started on '
-                    f'{protocol}://0.0.0.0:{self._load_balancer_port}')
+                    f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
+                    f'PID: {os.getpid()}')
 
         uvicorn.run(self._app,
                     host='0.0.0.0',