skypilot-nightly 1.0.0.dev20250806-py3-none-any.whl → 1.0.0.dev20250807-py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +33 -4
- sky/check.py +11 -1
- sky/client/cli/command.py +208 -93
- sky/client/sdk.py +14 -1
- sky/client/sdk_async.py +4 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-1e6de35d15a8d432.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +14 -2
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +43 -2
- sky/jobs/server/core.py +48 -1
- sky/jobs/server/server.py +52 -3
- sky/jobs/state.py +5 -1
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/serve/client/impl.py +85 -1
- sky/serve/client/sdk.py +16 -47
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +4 -2
- sky/serve/serve_state.py +28 -5
- sky/serve/serve_utils.py +77 -46
- sky/serve/server/core.py +13 -197
- sky/serve/server/impl.py +239 -2
- sky/serve/service.py +8 -3
- sky/server/common.py +11 -4
- sky/server/constants.py +1 -1
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +19 -0
- sky/task.py +18 -11
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/sky-serve-controller.yaml.j2 +1 -0
- sky/usage/usage_lib.py +8 -6
- sky/utils/annotations.py +8 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/db/migration_utils.py +2 -2
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +18 -13
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +95 -92
- sky/client/sdk.pyi +0 -301
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
- /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/serve/client/sdk.py
CHANGED

@@ -1,9 +1,8 @@
 """SDK for SkyServe."""
 import json
 import typing
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 
-from sky.client import common as client_common
 from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest

@@ -290,27 +289,13 @@ def tail_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-    [...]
-        follow=follow,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None),
-        stream=True)
-    request_id = server_common.get_request_id(response)
-    return sdk.stream_response(request_id=request_id,
-                               response=response,
-                               output_stream=output_stream,
-                               resumable=True)
+    return impl.tail_logs(service_name,
+                          target,
+                          replica_id,
+                          follow,
+                          output_stream,
+                          tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint

@@ -320,8 +305,8 @@ def sync_down_logs(service_name: str,
                    *,
                    targets: Optional[Union[
                        str, 'serve_utils.ServiceComponent',
-                       [...]
+                       Sequence[Union[str,
+                                      'serve_utils.ServiceComponent']]]] = None,
                    replica_ids: Optional[List[int]] = None,
                    tail: Optional[int] = None) -> None:
     """Sync down logs from the service components to a local directory.

@@ -352,25 +337,9 @@ def sync_down_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-    [...]
-        # to a directory on the API server.
-        local_dir=local_dir,
-        targets=targets,
-        replica_ids=replica_ids,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/sync-down-logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
-
-    # Download from API server paths to the client's local_dir
-    client_common.download_logs_from_api_server([remote_dir], remote_dir,
-                                                local_dir)
+    return impl.sync_down_logs(service_name,
+                               local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
sky/serve/constants.py
CHANGED

@@ -106,7 +106,8 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
 # v2.0 - Added template-replica feature.
 # v3.0 - Added cluster pool.
 # v4.0 - Added pool argument to wait_service_registration.
-[...]
+# v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
+SERVE_VERSION = 5
 
 TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
     'The version of service is outdated and does not support manually '
sky/serve/controller.py
CHANGED

@@ -27,11 +27,12 @@ from sky.utils import ux_utils
 logger = sky_logging.init_logger(__name__)
 
 
-class [...]
+class AutoscalerInfoFilter(logging.Filter):
 
     def filter(self, record: logging.LogRecord) -> bool:
         message = record.getMessage()
-        return not ('GET' in message and '200' in message [...]
+        return not ('GET' in message and '200' in message and
+                    '/autoscaler/info' in message)
 
 
 class SkyServeController:

@@ -61,6 +62,7 @@ class SkyServeController:
         uvicorn_access_logger = logging.getLogger('uvicorn.access')
         for handler in uvicorn_access_logger.handlers:
             handler.setFormatter(sky_logging.FORMATTER)
+            handler.addFilter(AutoscalerInfoFilter())
         yield
 
     def _run_autoscaler(self):
sky/serve/serve_state.py
CHANGED

@@ -9,6 +9,7 @@ import sqlite3
 import threading
 import typing
 from typing import Any, Dict, List, Optional, Tuple
+import uuid
 
 import colorama
 

@@ -82,6 +83,13 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
                                  'controller_pid',
                                  'INTEGER DEFAULT NULL',
                                  value_to_replace_existing_entries=-1)
+    # The service hash. Unique for each service, even if the service name is
+    # the same.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'hash',
+                                 'TEXT DEFAULT NULL')
+    # Entrypoint to launch the service.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'entrypoint',
+                                 'TEXT DEFAULT NULL')
     conn.commit()
 
 

@@ -284,7 +292,7 @@ _SERVICE_STATUS_TO_COLOR = {
 def add_service(name: str, controller_job_id: int, policy: str,
                 requested_resources_str: str, load_balancing_policy: str,
                 status: ServiceStatus, tls_encrypted: bool, pool: bool,
-                controller_pid: int) -> bool:
+                controller_pid: int, entrypoint: str) -> bool:
     """Add a service in the database.
 
     Returns:

@@ -299,11 +307,12 @@ def add_service(name: str, controller_job_id: int, policy: str,
                 INSERT INTO services
                 (name, controller_job_id, status, policy,
                 requested_resources_str, load_balancing_policy, tls_encrypted,
-                pool, controller_pid)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                pool, controller_pid, hash, entrypoint)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                 (name, controller_job_id, status.value, policy,
                  requested_resources_str, load_balancing_policy,
-                 int(tls_encrypted), int(pool), controller_pid [...]
+                 int(tls_encrypted), int(pool), controller_pid, str(
+                     uuid.uuid4()), entrypoint))
 
     except sqlite3.IntegrityError as e:
         if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:

@@ -397,7 +406,7 @@ def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
      _, active_versions, load_balancing_policy, tls_encrypted, pool,
-     controller_pid) = row[:[...]
+     controller_pid, svc_hash, entrypoint) = row[:19]
     record = {
         'name': name,
         'controller_job_id': controller_job_id,

@@ -418,6 +427,8 @@ def _get_service_from_row(row) -> Dict[str, Any]:
         'tls_encrypted': bool(tls_encrypted),
         'pool': bool(pool),
         'controller_pid': controller_pid,
+        'hash': svc_hash,
+        'entrypoint': entrypoint,
     }
     latest_spec = get_spec(name, current_version)
     if latest_spec is not None:

@@ -459,6 +470,18 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     return None
 
 
+@init_db
+def get_service_hash(service_name: str) -> Optional[str]:
+    """Get the hash of a service."""
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute('SELECT hash FROM services WHERE name=(?)',
+                              (service_name,)).fetchall()
+        for row in rows:
+            return row[0]
+        return None
+
+
 @init_db
 def get_service_versions(service_name: str) -> List[int]:
     """Gets all versions of a service."""
sky/serve/serve_utils.py
CHANGED

@@ -20,6 +20,7 @@ import uuid
 
 import colorama
 import filelock
+import yaml
 
 from sky import backends
 from sky import exceptions

@@ -65,13 +66,12 @@ def get_num_service_threshold():
 
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-# NOTE(dev): We assume log [...]
-[...]
-_SKYPILOT_PROVISION_LOG_PATTERN = ([...]
-[...]
-_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
+# when changing UX as this assumption is used to expand some log files while
+# ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (

@@ -668,12 +668,18 @@ def _get_service_status(
     if record['pool']:
         latest_yaml_path = generate_task_yaml_file_name(service_name,
                                                         record['version'])
-        [...]
-        original_config.[...]
-        original_config[...]
+        raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
+        original_config = raw_yaml_config.get('_user_specified_yaml')
+        if original_config is None:
+            # Fall back to old display format.
+            original_config = raw_yaml_config
+            original_config.pop('run', None)
+            svc: Dict[str, Any] = original_config.pop('service')
+            if svc is not None:
+                svc.pop('pool', None)  # Remove pool from service config
+                original_config['pool'] = svc  # Add pool to root config
+        else:
+            original_config = yaml.safe_load(original_config)
         record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
 
         record['target_num_replicas'] = 0

@@ -959,8 +965,10 @@ def wait_service_registration(service_name: str, job_id: int,
     """
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-        # [...]
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
         if not is_consolidation_mode(pool):
             job_status = job_lib.get_status(job_id)
             if job_status is None or job_status < job_lib.JobStatus.RUNNING:

@@ -971,7 +979,7 @@ def wait_service_registration(service_name: str, job_id: int,
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         f'Failed to start the controller process for '
-                        f'the [...]
+                        f'the {noun} {service_name!r} within '
                         f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
                         f' seconds.')
             # No need to check the service status as the controller process

@@ -979,22 +987,26 @@ def wait_service_registration(service_name: str, job_id: int,
                 time.sleep(1)
                 continue
 
-        [...]
+        if not setup_completed:
+            setup_completed = True
+            # Reset the start time to wait for the service to be registered.
+            start_time = time.time()
 
-        record = [...]
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
-            [...]
+            if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The [...]
-                        'Please specify a different name for your [...]
-                        'To update an existing [...]
-                        f'{service_name} <new-[...]
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)

@@ -1023,12 +1035,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def [...]
-[...]
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'[...]
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'[...]
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None

@@ -1067,7 +1083,10 @@ def _process_line(line: str,
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
     if provision_log_prompt is not None:
-        [...]
+        log_path = provision_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                log_path).resolve()
 
         try:
             with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:

@@ -1159,12 +1178,14 @@ def _capped_follow_logs_with_provision_expanding(
 
 
 def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
-                        tail: Optional[int]) -> str:
-    msg = [...]
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of [...]
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
         if tail is not None:

@@ -1181,7 +1202,7 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}[...]
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(

@@ -1231,6 +1252,10 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
             print(line, end='', flush=True)
         return ''
 
+    # For pools, we don't stream the job logs as the run section is ignored.
+    if pool:
+        return ''
+
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)

@@ -1245,13 +1270,13 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of [...]
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
     if tail is None:
         returncode = backend.tail_logs(handle, job_id=None, follow=follow)
         if returncode != 0:
-            return (f'{colorama.Fore.RED}Failed to stream logs for [...]
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
                     f'{replica_id}.{colorama.Style.RESET_ALL}')
     elif not follow and tail > 0:
         final = backend.tail_logs(handle,

@@ -1278,8 +1303,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool, tail: Optional[int][...]
-                              [...]
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:

@@ -1288,7 +1314,9 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
         log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record = [...]
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()

@@ -1531,21 +1559,24 @@ class ServeCodeGen:
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool, tail: Optional[int][...]
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
            'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}[...]
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
                                   stream_controller: bool, follow: bool,
-                                  tail: Optional[int]) -> str:
+                                  tail: Optional[int], pool: bool) -> str:
        code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
            f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}, tail={tail})',
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
            'print(msg, flush=True)'
        ]
        return cls._build(code)