skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
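Taken together, these helpers let a jobs controller recover a job's DAG YAML and environment file purely from the managed-jobs database, keeping the on-disk paths only as a legacy fallback. A minimal consumer sketch follows; only get_job_dag_content and get_job_env_content come from the diff above, while the load_dag_yaml wrapper and the temp-file materialization are illustrative assumptions.

    import tempfile

    from sky.jobs import file_content_utils

    def load_dag_yaml(job_id: int) -> str:
        # Prefer the DB-backed content; the helper already falls back to disk.
        content = file_content_utils.get_job_dag_content(job_id)
        if content is None:
            raise RuntimeError(f'No DAG YAML recorded for job {job_id}')
        return content

    def materialize_env_file(job_id: int) -> str:
        # Hypothetical: write the env content to a temp file for tools that
        # still expect a path; returns the path of the written file.
        content = file_content_utils.get_job_env_content(job_id) or ''
        with tempfile.NamedTemporaryFile('w',
                                         suffix='.env',
                                         delete=False,
                                         encoding='utf-8') as f:
            f.write(content)
            return f.name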
sky/jobs/log_gc.py
ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least once per hour to keep hourly accuracy, and
+    # at most once every 30 seconds (when retention_seconds is small)
+    # to avoid overly frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Task logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync-down of logs will reference this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update; the timestamp will not be exact, but that is okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        #  - run.log
+        #  - tasks/
+        #    - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have an at-least-once guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect for the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching a new process and its lifecycle management; the
+    threads that are not elected as the log garbage collector just wait
+    on the filelock and bring trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
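Because elect_for_log_gc() blocks on a filelock before running the GC loops, every controller process can attempt to start it and only one becomes the active collector. Below is a sketch of how a controller might wire this up in a background thread; the threading wrapper is an illustrative assumption, while the config keys and defaults mirror the skypilot_config.get_nested calls above (a negative retention disables that GC, and the default is 24 * 7 hours).

    import threading

    from sky.jobs import log_gc

    # Every controller process tries to become the GC leader; the losers
    # simply block on the filelock inside elect_for_log_gc().
    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc,
                                 name='managed-jobs-log-gc',
                                 daemon=True)
    gc_thread.start()

    # Retention is re-read from the SkyPilot config on every GC cycle, e.g.:
    #   jobs.controller.task_logs_gc_retention_hours: 168        (default, 7 days)
    #   jobs.controller.controller_logs_gc_retention_hours: -1   (disables that GC)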
sky/jobs/recovery_strategy.py
CHANGED
@@ -70,7 +70,6 @@ class StrategyExecutor:
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
@@ -85,7 +84,6 @@ class StrategyExecutor:
  max_restarts_on_errors: Maximum number of restarts on errors.
  job_id: The ID of the job.
  task_id: The ID of the task.
- job_logger: Logger instance for this specific job.
  starting: Set of job IDs that are currently starting.
  starting_lock: Lock to synchronize starting jobs.
  starting_signal: Condition to signal when a job can start.
@@ -105,7 +103,6 @@ class StrategyExecutor:
  self.task_id = task_id
  self.pool = pool
  self.restart_cnt_on_failure = 0
- self._logger = job_logger
  self.job_id_on_pool_cluster: Optional[int] = None
  self.starting = starting
  self.starting_lock = starting_lock
@@ -119,7 +116,6 @@ class StrategyExecutor:
  task: 'task_lib.Task',
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
@@ -156,7 +152,7 @@ class StrategyExecutor:
  assert job_recovery_strategy is not None, job_recovery_name
  return job_recovery_strategy(cluster_name, backend, task,
      max_restarts_on_errors, job_id, task_id,
-
+     pool, starting, starting_lock,
      starting_signal)
@@ -224,7 +220,7 @@ class StrategyExecutor:
  **kwargs,
  _try_cancel_if_cluster_is_init=True,
  )
-
+ logger.debug(f'sdk.cancel request ID: {request_id}')
  await context_utils.to_thread(
      sdk.get,
      request_id,
@@ -261,16 +257,15 @@ class StrategyExecutor:
  # loop.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
-
-
-     'refresh the cluster status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+     'refresh the cluster status. Retrying.')
  continue
  if cluster_status != status_lib.ClusterStatus.UP:
  # The cluster can be preempted before the job is
  # launched.
  # Break to let the retry launch kick in.
-
-
+ logger.info('The cluster is preempted before the job '
+     'is submitted.')
  # TODO(zhwu): we should recover the preemption with the
  # recovery strategy instead of the current while loop.
  break
@@ -279,7 +274,6 @@ class StrategyExecutor:
  status = await managed_job_utils.get_job_status(
      self.backend,
      self.cluster_name,
-     job_logger=self._logger,
      job_id=self.job_id_on_pool_cluster)
  except Exception as e:  # pylint: disable=broad-except
  # If any unexpected error happens, retry the job checking
@@ -288,9 +282,8 @@ class StrategyExecutor:
  # get_job_status, so it should not happen here.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
-
-
-     'job status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+     'job status. Retrying.')
  continue

  # Check the job status until it is not in initialized status
@@ -306,9 +299,8 @@ class StrategyExecutor:
  except Exception as e:  # pylint: disable=broad-except
  # If we failed to get the job timestamp, we will retry
  # job checking loop.
-
-
-     'the job start timestamp. Retrying.')
+ logger.info(f'Unexpected Exception: {e}\nFailed to get '
+     'the job start timestamp. Retrying.')
  continue
  # Wait for the job to be started
  await asyncio.sleep(
@@ -370,7 +362,6 @@ class StrategyExecutor:
  self.starting,
  self.starting_lock,
  self.starting_signal,
- self._logger,
  ):
  # The job state may have been PENDING during backoff -
  # update to STARTING or RECOVERING.
@@ -394,21 +385,19 @@ class StrategyExecutor:
  for env_var in ENV_VARS_TO_CLEAR:
  vars_to_restore[env_var] = os.environ.pop(
      env_var, None)
-
-
-
-
+ logger.debug('Cleared env var: '
+     f'{env_var}')
+ logger.debug('Env vars for api_start: '
+     f'{os.environ}')
  await context_utils.to_thread(sdk.api_start)
-
+ logger.info('API server started.')
  finally:
  for env_var, value in vars_to_restore.items():
  if value is not None:
-
-
-     f'{env_var}: {value}')
+ logger.debug('Restored env var: '
+     f'{env_var}: {value}')
  os.environ[env_var] = value

- log_file = _get_logger_file(self._logger)
  request_id = None
  try:
  request_id = await context_utils.to_thread(
@@ -429,31 +418,27 @@ class StrategyExecutor:
  # down=True,
  _is_launched_by_jobs_controller=True,
  )
-
-
-
-
-
-
-     sdk.stream_and_get,
-     request_id,
-     output_stream=f,
- )
+ logger.debug('sdk.launch request ID: '
+     f'{request_id}')
+ await context_utils.to_thread(
+     sdk.stream_and_get,
+     request_id,
+ )
  except asyncio.CancelledError:
  if request_id:
  req = await context_utils.to_thread(
      sdk.api_cancel, request_id)
-
-
+ logger.debug('sdk.api_cancel request '
+     f'ID: {req}')
  try:
  await context_utils.to_thread(
      sdk.get, req)
  except Exception as e:  # pylint: disable=broad-except
  # we must still return a CancelledError
-
+ logger.error(
      f'Failed to cancel the job: {e}')
  raise
-
+ logger.info('Managed job cluster launched.')
  else:
  self.cluster_name = await (context_utils.to_thread(
      serve_utils.get_next_cluster_name, self.pool,
@@ -468,8 +453,8 @@ class StrategyExecutor:
  self.dag,
  cluster_name=self.cluster_name,
  )
-
-
+ logger.debug('sdk.exec request ID: '
+     f'{request_id}')
  job_id_on_pool_cluster, _ = (
      await context_utils.to_thread(
          sdk.get, request_id))
@@ -477,14 +462,14 @@ class StrategyExecutor:
  if request_id:
  req = await context_utils.to_thread(
      sdk.api_cancel, request_id)
-
-
+ logger.debug('sdk.api_cancel request '
+     f'ID: {req}')
  try:
  await context_utils.to_thread(
      sdk.get, req)
  except Exception as e:  # pylint: disable=broad-except
  # we must still return a CancelledError
-
+ logger.error(
      f'Failed to cancel the job: {e}')
  raise
  assert job_id_on_pool_cluster is not None, (
@@ -492,15 +477,14 @@ class StrategyExecutor:
  self.job_id_on_pool_cluster = job_id_on_pool_cluster
  await state.set_job_id_on_pool_cluster_async(
      self.job_id, job_id_on_pool_cluster)
-
+ logger.info('Managed job cluster launched.')
  except (exceptions.InvalidClusterNameError,
      exceptions.NoCloudAccessError,
      exceptions.ResourcesMismatchError,
      exceptions.StorageSpecError,
      exceptions.StorageError) as e:
-
-
-     f'{common_utils.format_exception(e)}')
+ logger.error('Failure happened before provisioning. '
+     f'{common_utils.format_exception(e)}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
      reasons=[e])
@@ -528,24 +512,22 @@ class StrategyExecutor:
  reasons_str = '; '.join(
      common_utils.format_exception(err)
      for err in reasons)
-
+ logger.error(
      'Failure happened before provisioning. '
      f'Failover reasons: {reasons_str}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
      reasons)
  return None
-
-
-     f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+     f'{common_utils.format_exception(e)})')
  except Exception as e:  # pylint: disable=broad-except
  # If the launch fails, it will be recovered by the
  # following code.
-
-
-     f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+     f'{common_utils.format_exception(e)})')
  with ux_utils.enable_traceback():
-
+ logger.info(
      f'  Traceback: {traceback.format_exc()}')
  else:  # No exception, the launch succeeds.
  # At this point, a sky.launch() has succeeded. Cluster
@@ -559,7 +541,7 @@ class StrategyExecutor:
  # launch.
  # TODO(zhwu): log the unexpected error to usage
  # collection for future debugging.
-
+ logger.info(
      'Failed to successfully submit the job to the '
      'launched cluster, due to unexpected submission '
      'errors or the cluster being preempted during '
@@ -594,8 +576,8 @@ class StrategyExecutor:
  # Calculate the backoff time and sleep.
  gap_seconds = (backoff.current_backoff()
      if self.pool is None else 1)
-
-
+ logger.info('Retrying to launch the cluster in '
+     f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
  else:
@@ -630,15 +612,14 @@ class FailoverStrategyExecutor(StrategyExecutor):
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
  ) -> None:
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-     job_id, task_id,
-
+     job_id, task_id, pool, starting, starting_lock,
+     starting_signal)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
@@ -694,14 +675,13 @@ class FailoverStrategyExecutor(StrategyExecutor):
  return job_submitted_at

  # Step 2
-
-
+ logger.debug('Terminating unhealthy cluster and reset cloud '
+     'region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 3
-
-
-     'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+     'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
      raise_on_failure=False,
@@ -709,8 +689,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
-
-
+ logger.info('Retrying to recover the cluster in '
+     f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue

@@ -755,14 +735,12 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  # task.resources.

  # Step 1
-
-     'Terminating unhealthy cluster and reset cloud region.')
+ logger.debug('Terminating unhealthy cluster and reset cloud region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 2
-
-
-     'cloud/region.')
+ logger.debug('Relaunch the cluster skipping the previously launched '
+     'cloud/region.')
  if self._launched_resources is not None:
  task = self.dag.tasks[0]
  requested_resources = self._launched_resources
@@ -787,9 +765,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

  while True:
  # Step 3
-
-
-     'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+     'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
      raise_on_failure=False,
@@ -797,8 +774,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
-
-
+ logger.info('Retrying to recover the cluster in '
+     f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
