skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +102 -8
- sky/backends/cloud_vm_ray_backend.py +197 -31
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +60 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/core.py +5 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +15 -0
- sky/global_user_state.py +160 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +6 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +22 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +17 -2
- sky/provision/__init__.py +4 -2
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +41 -17
- sky/provision/azure/instance.py +7 -4
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +7 -4
- sky/provision/do/instance.py +7 -4
- sky/provision/fluidstack/instance.py +7 -4
- sky/provision/gcp/instance.py +7 -4
- sky/provision/hyperbolic/instance.py +7 -5
- sky/provision/kubernetes/instance.py +169 -6
- sky/provision/lambda_cloud/instance.py +7 -4
- sky/provision/nebius/instance.py +7 -4
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +7 -5
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +7 -4
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +7 -5
- sky/provision/vast/instance.py +7 -5
- sky/provision/vsphere/instance.py +7 -4
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +1 -1
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +58 -23
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/skypilot_config.py +4 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +9 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +39 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/schemas/__init__.py
ADDED
File without changes

sky/schemas/api/__init__.py
ADDED
File without changes

sky/schemas/api/responses.py
ADDED
@@ -0,0 +1,70 @@
+"""Responses for the API server."""
+
+from typing import Optional
+
+import pydantic
+
+from sky import models
+from sky.server import common
+
+
+class ResponseBaseModel(pydantic.BaseModel):
+    """A pydantic model that acts like a dict.
+
+    Supports the following syntax:
+    class SampleResponse(ResponseBaseModel):
+        field: str
+
+    response = SampleResponse(field='value')
+    print(response['field'])  # prints 'value'
+    response['field'] = 'value2'
+    print(response['field'])  # prints 'value2'
+    print('field' in response)  # prints True
+
+    This model exists for backwards compatibility with the
+    old SDK that used to return a dict.
+
+    The backward compatibility may be removed
+    in the future.
+    """
+    # Ignore extra fields in the request body, which is useful for backward
+    # compatibility. The difference with `allow` is that `ignore` will not
+    # include the unknown fields when dumping the model, i.e., we can add new
+    # fields to the request body without breaking an existing old API server
+    # whose handler function does not accept the new field in its
+    # signature.
+    model_config = pydantic.ConfigDict(extra='ignore')
+
+    # backward compatibility with dict
+    # TODO(syang): remove this in v0.13.0
+    def __getitem__(self, key):
+        try:
+            return getattr(self, key)
+        except AttributeError as e:
+            raise KeyError(key) from e
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __contains__(self, key):
+        return hasattr(self, key)
+
+    def keys(self):
+        return self.model_dump().keys()
+
+    def values(self):
+        return self.model_dump().values()
+
+    def items(self):
+        return self.model_dump().items()
+
+
+class APIHealthResponse(ResponseBaseModel):
+    """Response for the API health endpoint."""
+    status: common.ApiServerStatus
+    api_version: str = ''
+    version: str = ''
+    version_on_disk: str = ''
+    commit: str = ''
+    basic_auth_enabled: bool = False
+    user: Optional[models.User] = None
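Reviewer note: a minimal, self-contained sketch of the dict-style compatibility shims above, showing why SDK code written against the old dict return value keeps working. This is illustrative only; `_DictLike` and `HealthLike` are hypothetical stand-ins, not SkyPilot classes.

import pydantic

# Hypothetical stand-in mirroring ResponseBaseModel's dict-like shims.
class _DictLike(pydantic.BaseModel):
    model_config = pydantic.ConfigDict(extra='ignore')

    def __getitem__(self, key):
        try:
            return getattr(self, key)
        except AttributeError as e:
            raise KeyError(key) from e

    def __contains__(self, key):
        return hasattr(self, key)

# Hypothetical response type, loosely shaped like APIHealthResponse.
class HealthLike(_DictLike):
    status: str = 'healthy'
    api_version: str = ''

resp = HealthLike(api_version='12', unknown_field='x')  # extra field ignored
assert resp['status'] == 'healthy'   # old dict-style lookup still works
assert 'api_version' in resp         # membership test still works
assert 'unknown_field' not in resp.model_dump()  # extra='ignore' drops it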
sky/schemas/db/global_user_state/001_initial_schema.py
CHANGED
@@ -22,7 +22,7 @@ depends_on = None
 def upgrade():
     with op.get_context().autocommit_block():
         # Create any missing tables with current schema first
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add all missing columns to clusters table
         # This allows each column addition to fail independently without rolling
sky/schemas/db/global_user_state/005_cluster_event.py
ADDED
@@ -0,0 +1,32 @@
+"""Columns for whether the cluster is managed.
+
+Revision ID: 005
+Revises: 004
+Create Date: 2025-08-08
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+
+from sky.global_user_state import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '005'
+down_revision: Union[str, Sequence[str], None] = '004'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add new table for cluster events."""
+    with op.get_context().autocommit_block():
+        # Add new table for cluster events.
+        db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
+                                            'cluster_events')
+
+
+def downgrade():
+    pass
sky/schemas/db/serve_state/001_initial_schema.py
CHANGED
@@ -26,7 +26,7 @@ def upgrade():
     """Create initial schema and add all backwards compatibility columns"""
     with op.get_context().autocommit_block():
        # Create all tables with their current schema
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add backwards compatibility columns using helper function that matches
         # original add_column_to_table_sqlalchemy behavior exactly
sky/schemas/db/spot_jobs/001_initial_schema.py
CHANGED
@@ -26,7 +26,7 @@ def upgrade():
     """Create initial schema and add all backwards compatibility columns"""
     with op.get_context().autocommit_block():
         # Create all tables with their current schema
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add backwards compatibility columns using helper function that matches
         # original add_column_to_table_sqlalchemy behavior exactly
sky/schemas/generated/__init__.py
ADDED
File without changes
sky/schemas/generated/autostopv1_pb2.py
ADDED
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: sky/schemas/generated/autostopv1.proto
+# Protobuf Python Version: 5.26.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n&sky/schemas/generated/autostopv1.proto\x12\x0b\x61utostop.v1\"y\n\x12SetAutostopRequest\x12\x14\n\x0cidle_minutes\x18\x01 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x02 \x01(\t\x12.\n\x08wait_for\x18\x03 \x01(\x0e\x32\x1c.autostop.v1.AutostopWaitFor\x12\x0c\n\x04\x64own\x18\x04 \x01(\x08\"\x15\n\x13SetAutostopResponse\"\x17\n\x15IsAutostoppingRequest\"1\n\x16IsAutostoppingResponse\x12\x17\n\x0fis_autostopping\x18\x01 \x01(\x08*\x90\x01\n\x0f\x41utostopWaitFor\x12!\n\x1d\x41UTOSTOP_WAIT_FOR_UNSPECIFIED\x10\x00\x12\"\n\x1e\x41UTOSTOP_WAIT_FOR_JOBS_AND_SSH\x10\x01\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_JOBS\x10\x02\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_NONE\x10\x03\x32\xbe\x01\n\x0f\x41utostopService\x12P\n\x0bSetAutostop\x12\x1f.autostop.v1.SetAutostopRequest\x1a .autostop.v1.SetAutostopResponse\x12Y\n\x0eIsAutostopping\x12\".autostop.v1.IsAutostoppingRequest\x1a#.autostop.v1.IsAutostoppingResponseb\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sky.schemas.generated.autostopv1_pb2', _globals)
+if not _descriptor._USE_C_DESCRIPTORS:
+  DESCRIPTOR._loaded_options = None
+  _globals['_AUTOSTOPWAITFOR']._serialized_start=278
+  _globals['_AUTOSTOPWAITFOR']._serialized_end=422
+  _globals['_SETAUTOSTOPREQUEST']._serialized_start=55
+  _globals['_SETAUTOSTOPREQUEST']._serialized_end=176
+  _globals['_SETAUTOSTOPRESPONSE']._serialized_start=178
+  _globals['_SETAUTOSTOPRESPONSE']._serialized_end=199
+  _globals['_ISAUTOSTOPPINGREQUEST']._serialized_start=201
+  _globals['_ISAUTOSTOPPINGREQUEST']._serialized_end=224
+  _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_start=226
+  _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_end=275
+  _globals['_AUTOSTOPSERVICE']._serialized_start=425
+  _globals['_AUTOSTOPSERVICE']._serialized_end=615
+# @@protoc_insertion_point(module_scope)
sky/schemas/generated/autostopv1_pb2.pyi
ADDED
@@ -0,0 +1,43 @@
+from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
+
+DESCRIPTOR: _descriptor.FileDescriptor
+
+class AutostopWaitFor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
+    __slots__ = ()
+    AUTOSTOP_WAIT_FOR_UNSPECIFIED: _ClassVar[AutostopWaitFor]
+    AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: _ClassVar[AutostopWaitFor]
+    AUTOSTOP_WAIT_FOR_JOBS: _ClassVar[AutostopWaitFor]
+    AUTOSTOP_WAIT_FOR_NONE: _ClassVar[AutostopWaitFor]
+AUTOSTOP_WAIT_FOR_UNSPECIFIED: AutostopWaitFor
+AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: AutostopWaitFor
+AUTOSTOP_WAIT_FOR_JOBS: AutostopWaitFor
+AUTOSTOP_WAIT_FOR_NONE: AutostopWaitFor
+
+class SetAutostopRequest(_message.Message):
+    __slots__ = ("idle_minutes", "backend", "wait_for", "down")
+    IDLE_MINUTES_FIELD_NUMBER: _ClassVar[int]
+    BACKEND_FIELD_NUMBER: _ClassVar[int]
+    WAIT_FOR_FIELD_NUMBER: _ClassVar[int]
+    DOWN_FIELD_NUMBER: _ClassVar[int]
+    idle_minutes: int
+    backend: str
+    wait_for: AutostopWaitFor
+    down: bool
+    def __init__(self, idle_minutes: _Optional[int] = ..., backend: _Optional[str] = ..., wait_for: _Optional[_Union[AutostopWaitFor, str]] = ..., down: bool = ...) -> None: ...
+
+class SetAutostopResponse(_message.Message):
+    __slots__ = ()
+    def __init__(self) -> None: ...
+
+class IsAutostoppingRequest(_message.Message):
+    __slots__ = ()
+    def __init__(self) -> None: ...
+
+class IsAutostoppingResponse(_message.Message):
+    __slots__ = ("is_autostopping",)
+    IS_AUTOSTOPPING_FIELD_NUMBER: _ClassVar[int]
+    is_autostopping: bool
+    def __init__(self, is_autostopping: bool = ...) -> None: ...
sky/schemas/generated/autostopv1_pb2_grpc.py
ADDED
@@ -0,0 +1,146 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from sky.schemas.generated import autostopv1_pb2 as sky_dot_schemas_dot_generated_dot_autostopv1__pb2
+
+GRPC_GENERATED_VERSION = '1.63.0'
+GRPC_VERSION = grpc.__version__
+EXPECTED_ERROR_RELEASE = '1.65.0'
+SCHEDULED_RELEASE_DATE = 'June 25, 2024'
+_version_not_supported = False
+
+try:
+    from grpc._utilities import first_version_is_lower
+    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+    _version_not_supported = True
+
+if _version_not_supported:
+    warnings.warn(
+        f'The grpc package installed is at version {GRPC_VERSION},'
+        + f' but the generated code in sky/schemas/generated/autostopv1_pb2_grpc.py depends on'
+        + f' grpcio>={GRPC_GENERATED_VERSION}.'
+        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+        + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
+        + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
+        RuntimeWarning
+    )
+
+
+class AutostopServiceStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.SetAutostop = channel.unary_unary(
+                '/autostop.v1.AutostopService/SetAutostop',
+                request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
+                _registered_method=True)
+        self.IsAutostopping = channel.unary_unary(
+                '/autostop.v1.AutostopService/IsAutostopping',
+                request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
+                _registered_method=True)
+
+
+class AutostopServiceServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def SetAutostop(self, request, context):
+        """Set autostop configuration for the cluster.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def IsAutostopping(self, request, context):
+        """Check if the cluster is currently autostopping.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_AutostopServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'SetAutostop': grpc.unary_unary_rpc_method_handler(
+                    servicer.SetAutostop,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.SerializeToString,
+            ),
+            'IsAutostopping': grpc.unary_unary_rpc_method_handler(
+                    servicer.IsAutostopping,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'autostop.v1.AutostopService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+# This class is part of an EXPERIMENTAL API.
+class AutostopService(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def SetAutostop(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/autostop.v1.AutostopService/SetAutostop',
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def IsAutostopping(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/autostop.v1.AutostopService/IsAutostopping',
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
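Reviewer note: a minimal client-side sketch of calling the generated autostop stubs above. The channel address is a placeholder; SkyPilot's actual wiring (see sky/skylet/services.py in the file list) is not shown in this diff.

# Hypothetical client sketch for the AutostopService defined above.
# 'localhost:50051' is a placeholder address, not SkyPilot's real endpoint.
import grpc

from sky.schemas.generated import autostopv1_pb2
from sky.schemas.generated import autostopv1_pb2_grpc

with grpc.insecure_channel('localhost:50051') as channel:
    stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
    # Request fields mirror the .proto: idle_minutes, backend, wait_for, down.
    request = autostopv1_pb2.SetAutostopRequest(
        idle_minutes=10,
        backend='cloudvmray',  # placeholder backend name
        wait_for=autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
        down=False)
    stub.SetAutostop(request)
    reply = stub.IsAutostopping(autostopv1_pb2.IsAutostoppingRequest())
    print(reply.is_autostopping)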
sky/serve/constants.py
CHANGED
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
     'down': False,
 }
 
-# Due to the CPU/memory usage of the controller process launched with a job on
-# controller VM (use ray job under the hood), we need to reserve some CPU/memory
-# for each serve controller process.
-# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
-# services.
-CONTROLLER_MEMORY_USAGE_GB = 1.0
-
 # A period of time to initialize your service. Any readiness probe failures
 # during this period will be ignored.
 DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -115,3 +108,6 @@ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
 
 # Dummy run command for cluster pool.
 POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
+
+# Error message for max number of services reached.
+MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
sky/serve/replica_managers.py
CHANGED
@@ -13,16 +13,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import
+import filelock
 import requests
 
-import sky
 from sky import backends
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
@@ -41,7 +41,6 @@ from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -51,10 +50,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
-# Since sky.launch is very resource demanding, we limit the number of
-# concurrent sky.launch process to avoid overloading the machine.
-_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
-
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +76,7 @@ def launch_cluster(replica_id: int,
     try:
         config = common_utils.read_yaml(
             os.path.expanduser(service_task_yaml_path))
-        task =
+        task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
             resources = task.resources
             overrided_resources = [
@@ -177,7 +172,7 @@ def terminate_cluster(cluster_name: str,
 
 def _get_resources_ports(service_task_yaml_path: str) -> str:
     """Get the resources ports used by the task."""
-    task =
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -195,7 +190,7 @@ def _should_use_spot(service_task_yaml_path: str,
     if use_spot_override is not None:
         assert isinstance(use_spot_override, bool)
         return use_spot_override
-    task =
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     spot_use_resources = [
         resources for resources in task.resources if resources.use_spot
     ]
@@ -688,7 +683,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  service_task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self.service_task_yaml_path = service_task_yaml_path
-        task =
+        task = task_lib.Task.from_yaml(service_task_yaml_path)
         self._spot_placer: Optional[spot_placer.SpotPlacer] = (
             spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -872,8 +867,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'replica_jobs')
-
-
+        job_ids = ['1'] if self._is_pool else None
+        job_log_file_name = controller_utils.download_and_stream_job_log(
+            backend, handle, replica_job_logs_dir, job_ids)
         if job_log_file_name is not None:
             logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
             with open(log_file_name, 'a',
@@ -981,7 +977,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         # To avoid `dictionary changed size during iteration` error.
         launch_process_pool_snapshot = list(self._launch_process_pool.items())
         for replica_id, p in launch_process_pool_snapshot:
-            if
+            if p.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
@@ -989,8 +987,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                if
-                    _MAX_NUM_LAUNCH):
+                    if controller_utils.can_provision():
                         p.start()
                         info.status_property.sky_launch_status = (
                             ProcessStatus.RUNNING)
@@ -1044,6 +1041,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
+        # Try schedule next job after acquiring the lock.
+        jobs_scheduler.maybe_schedule_next_jobs()
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
             if not p.is_alive():
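Reviewer note: the PENDING-launch handling above replaces the old `_MAX_NUM_LAUNCH` CPU-count cap with a `controller_utils.can_provision()` check taken under a shared file lock. A stripped-down sketch of that gate-then-start pattern; the lock path and capacity predicate here are stand-ins, not SkyPilot's implementation.

import filelock

LOCK_PATH = '/tmp/serve.resources.lock'  # placeholder for get_resources_lock_path()

def can_provision() -> bool:
    # Stand-in predicate; the real check is controller_utils.can_provision().
    return True

def start_pending(launch_processes) -> None:
    """Start pending launch processes only while capacity remains."""
    for proc in launch_processes:  # multiprocessing.Process-like objects
        if proc.is_alive():
            continue  # already running; nothing to do
        # Take the shared resources lock so the capacity check and the
        # decision to start are atomic across controller processes.
        with filelock.FileLock(LOCK_PATH):
            if can_provision():
                proc.start()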
sky/serve/serve_state.py
CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
     return records
 
 
+@init_db
+def get_num_services() -> int:
+    """Get the number of services."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        return session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(services_table)).fetchone()[0]
+
+
 @init_db
 def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     """Get all existing service records."""
sky/serve/serve_utils.py
CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -56,14 +57,6 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
-
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
 # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
@@ -259,14 +252,47 @@ def get_service_filelock_path(pool: str) -> str:
     return str(path)
 
 
+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is enabled, but the '
+                    f'controller cluster {controller_cn} is still running. '
+                    'Please terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is disabled, but there are '
+                    f'still {len(all_services)} {noun}s running. Please '
+                    f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode(pool: bool = False) -> bool:
     # Use jobs config for pool consolidation mode.
-
+    controller = controller_utils.get_controller_for_pool(pool).value
     consolidation_mode = skypilot_config.get_nested(
-        (controller_type, 'controller', 'consolidation_mode'),
+        (controller.controller_type, 'controller', 'consolidation_mode'),
         default_value=False)
-    #
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode
 
 
@@ -490,6 +516,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 
@@ -762,9 +790,13 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
-def
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
     logger.info(f'Get number of replicas for pool {service_name!r}')
-    return
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
 
 
 def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
@@ -789,12 +821,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
         logger.error(f'Service {service_name!r} is not a cluster pool.')
         return None
     with filelock.FileLock(get_service_filelock_path(service_name)):
-
         logger.debug(f'Get next cluster name for pool {service_name!r}')
-        ready_replicas =
-            info for info in serve_state.get_replica_infos(service_name)
-            if info.status == serve_state.ReplicaStatus.READY
-        ]
+        ready_replicas = get_ready_replicas(service_name)
         idle_replicas: List['replica_managers.ReplicaInfo'] = []
         for replica_info in ready_replicas:
             jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
@@ -1010,11 +1038,18 @@ def wait_service_registration(service_name: str, job_id: int,
         lb_port = record['load_balancer_port']
         if lb_port is not None:
             return message_utils.encode_payload(lb_port)
-
-
-
-
-
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
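Reviewer note: the registration-wait change above fails fast by scanning the controller log for the new `MAX_NUMBER_OF_SERVICES_REACHED_ERROR` sentinel instead of waiting out the full timeout. The pattern in isolation, with the path and sentinel hard-coded for illustration:

import os

SENTINEL = 'Max number of services reached.'  # mirrors the new constant

def check_controller_log(log_path: str) -> None:
    """Raise early if the controller already reported the capacity error."""
    log_path = os.path.expanduser(log_path)
    if not os.path.exists(log_path):
        return  # controller has not written a log yet; keep waiting
    with open(log_path, 'r', encoding='utf-8') as f:
        if SENTINEL in f.read():
            raise RuntimeError('Max number of services reached. '
                               'To spin up more services, please '
                               'tear down some existing services.')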