skypilot-nightly 1.0.0.dev20250118__py3-none-any.whl → 1.0.0.dev20250120__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/provision/gcp/instance_utils.py +15 -9
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +54 -2
- sky/serve/load_balancer.py +27 -10
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/RECORD +18 -18
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250118.dist-info → skypilot_nightly-1.0.0.dev20250120.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'b81d0a0f6d5410948520c3fe021fba07cd8ae21d'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250118'
|
38
|
+
__version__ = '1.0.0.dev20250120'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/provision/gcp/instance_utils.py
CHANGED
@@ -38,7 +38,7 @@ _FIREWALL_RESOURCE_NOT_FOUND_PATTERN = re.compile(
|
|
38
38
|
r'The resource \'projects/.*/global/firewalls/.*\' was not found')
|
39
39
|
|
40
40
|
|
41
|
-
def _retry_on_http_exception(
|
41
|
+
def _retry_on_gcp_http_exception(
|
42
42
|
regex: Optional[str] = None,
|
43
43
|
max_retries: int = GCP_MAX_RETRIES,
|
44
44
|
retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
|
@@ -49,17 +49,18 @@ def _retry_on_http_exception(
|
|
49
49
|
|
50
50
|
@functools.wraps(func)
|
51
51
|
def wrapper(*args, **kwargs):
|
52
|
-
exception_type = gcp.http_error_exception()
|
53
52
|
|
54
53
|
def try_catch_exc():
|
55
54
|
try:
|
56
55
|
value = func(*args, **kwargs)
|
57
56
|
return value
|
58
57
|
except Exception as e: # pylint: disable=broad-except
|
59
|
-
if
|
60
|
-
|
61
|
-
|
62
|
-
|
58
|
+
if (isinstance(e, gcp.http_error_exception()) and
|
59
|
+
(regex is None or re.search(regex, str(e)))):
|
60
|
+
logger.error(
|
61
|
+
f'Retrying for gcp.http_error_exception: {e}')
|
62
|
+
return e
|
63
|
+
raise
|
63
64
|
|
64
65
|
for _ in range(max_retries):
|
65
66
|
ret = try_catch_exc()
|
@@ -431,7 +432,7 @@ class GCPComputeInstance(GCPInstance):
|
|
431
432
|
logger.debug(
|
432
433
|
f'Waiting GCP operation {operation["name"]} to be ready ...')
|
433
434
|
|
434
|
-
@_retry_on_http_exception(
|
435
|
+
@_retry_on_gcp_http_exception(
|
435
436
|
f'Failed to wait for operation {operation["name"]}')
|
436
437
|
def call_operation(fn, timeout: int):
|
437
438
|
request = fn(
|
@@ -613,6 +614,11 @@ class GCPComputeInstance(GCPInstance):
|
|
613
614
|
return operation
|
614
615
|
|
615
616
|
@classmethod
|
617
|
+
# When there is a cloud function running in parallel to set labels for
|
618
|
+
# newly created instances, it may fail with the following error:
|
619
|
+
# "Labels fingerprint either invalid or resource labels have changed"
|
620
|
+
# We should retry until the labels are set successfully.
|
621
|
+
@_retry_on_gcp_http_exception('Labels fingerprint either invalid')
|
616
622
|
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
|
617
623
|
labels: dict) -> None:
|
618
624
|
node = cls.load_resource().instances().get(
|
@@ -1211,7 +1217,7 @@ class GCPTPUVMInstance(GCPInstance):
|
|
1211
1217
|
"""Poll for TPU operation until finished."""
|
1212
1218
|
del project_id, region, zone # unused
|
1213
1219
|
|
1214
|
-
@_retry_on_http_exception(
|
1220
|
+
@_retry_on_gcp_http_exception(
|
1215
1221
|
f'Failed to wait for operation {operation["name"]}')
|
1216
1222
|
def call_operation(fn, timeout: int):
|
1217
1223
|
request = fn(name=operation['name'])
|
@@ -1379,7 +1385,7 @@ class GCPTPUVMInstance(GCPInstance):
|
|
1379
1385
|
f'Failed to get VPC name for instance {instance}') from e
|
1380
1386
|
|
1381
1387
|
@classmethod
|
1382
|
-
@_retry_on_http_exception('unable to queue the operation')
|
1388
|
+
@_retry_on_gcp_http_exception('unable to queue the operation')
|
1383
1389
|
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
|
1384
1390
|
labels: dict) -> None:
|
1385
1391
|
while True:
|