skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -1,16 +1,17 @@
 """Controller: handles scheduling and the life cycle of a managed job.
 """
 import asyncio
-import
+import io
 import os
 import pathlib
 import resource
 import shutil
 import sys
+import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set
+from typing import Dict, Optional, Set

 import dotenv

@@ -23,6 +24,8 @@ from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
 from sky.jobs import constants as jobs_constants
+from sky.jobs import file_content_utils
+from sky.jobs import log_gc
 from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -30,6 +33,7 @@ from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import context
@@ -62,17 +66,26 @@ async def create_background_task(coro: typing.Coroutine) -> None:
     task.add_done_callback(_background_tasks.discard)


-
-
-
-
-
+# Make sure to limit the size as we don't want to cache too many DAGs in memory.
+@annotations.lru_cache(scope='global', maxsize=50)
+def _get_dag(job_id: int) -> 'sky.Dag':
+    dag_content = file_content_utils.get_job_dag_content(job_id)
+    if dag_content is None:
+        raise RuntimeError('Managed job DAG YAML content is unavailable for '
+                           f'job {job_id}. This can happen if the job was '
+                           'submitted before file migration completed or if '
+                           'the submission failed to persist the DAG. Please '
+                           're-submit the job.')

+    dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
+    assert dag.name is not None, dag
+    return dag

-
+
+class JobController:
     """Controls the lifecycle of a single managed job.

-    This controller executes
+    This controller executes the chain DAG recorded for the job by:
     - Loading the DAG and preparing per-task environment variables so each task
       has a stable global job identifier across recoveries.
     - Launching the task on the configured backend (``CloudVmRayBackend``),
@@ -92,10 +105,10 @@ class JobsController:

     Key attributes:
     - ``_job_id``: Integer identifier of this managed job.
-    - ``
+    - ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
+      database-backed job YAML.
     - ``_backend``: Backend used to launch and manage clusters.
     - ``_pool``: Optional pool name if using a cluster pool.
-    - ``_logger``: Job-scoped logger for progress and diagnostics.
     - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
       coordination primitives. ``starting_lock`` must be used for accessing
       ``starting_signal`` and ``starting``
@@ -106,8 +119,6 @@ class JobsController:
     def __init__(
         self,
         job_id: int,
-        dag_yaml: str,
-        job_logger: logging.Logger,
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
@@ -117,8 +128,6 @@ class JobsController:

         Args:
             job_id: Integer ID of the managed job.
-            dag_yaml: Path to the YAML file containing the chain DAG to run.
-            job_logger: Logger instance dedicated to this job.
             starting: Shared set of job IDs currently in the STARTING phase,
                 used to limit concurrent launches.
             starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +143,12 @@ class JobsController:
         self.starting_lock = starting_lock
         self.starting_signal = starting_signal

-
-        self._logger.info(f'Initializing JobsController for job_id={job_id}, '
-                          f'dag_yaml={dag_yaml}')
+        logger.info('Initializing JobsController for job_id=%s', job_id)

         self._job_id = job_id
-        self.
-        self.
-
+        self._dag = _get_dag(job_id)
+        self._dag_name = self._dag.name
+        logger.info(f'Loaded DAG: {self._dag}')

         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
         self._pool = pool
@@ -191,8 +198,8 @@ class JobsController:
             preemptions or ssh disconnection during the streaming.
         """
         if handle is None:
-
-
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
             return

         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +217,11 @@ class JobsController:
             managed_job_state.set_local_log_file(self._job_id, task_id,
                                                  log_file)
         else:
-
+            logger.warning(
                 f'No log file was downloaded for job {self._job_id}, '
                 f'task {task_id}')

-
+        logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
         if cluster_name is None:
@@ -259,7 +266,7 @@ class JobsController:
             Other exceptions may be raised depending on the backend.
         """
         task_start_time = time.time()
-
+        logger.info(
             f'Starting task {task_id} ({task.name}) for job {self._job_id}')

         latest_task_id, last_task_prev_status = (
@@ -271,22 +278,20 @@ class JobsController:
                 managed_job_state.ManagedJobStatus.PENDING):
             assert latest_task_id >= task_id, (latest_task_id, task_id)
             if latest_task_id > task_id:
-
-
+                logger.info(f'Task {task_id} ({task.name}) has already '
+                            'been executed. Skipping...')
                 return True
             if latest_task_id == task_id:
                 # Start recovery.
                 is_resume = True
-
-                    f'Resuming task {task_id} from previous execution')
+                logger.info(f'Resuming task {task_id} from previous execution')

         callback_func = managed_job_utils.event_callback_func(
             job_id=self._job_id, task_id=task_id, task=task)

         if task.run is None:
-
-
-                        'run commands being empty.')
+            logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+                        'run commands being empty.')
             # Call set_started first to initialize columns in the state table,
             # including start_at and last_recovery_at to avoid issues for
             # uninitialized columns.
@@ -300,8 +305,7 @@ class JobsController:
                 task_id=task_id,
                 end_time=time.time(),
                 callback_func=callback_func)
-
-                f'Empty task {task_id} marked as succeeded immediately')
+            logger.info(f'Empty task {task_id} marked as succeeded immediately')
             return True

         usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +318,7 @@ class JobsController:
             task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
             cluster_name, self._backend, task, self._job_id, task_id,
-            self.
-            self.starting_signal)
+            self._pool, self.starting, self.starting_lock, self.starting_signal)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -336,11 +339,11 @@ class JobsController:
                     self._strategy_executor.max_restarts_on_errors
                 },
                 callback_func=callback_func)
-
-
-
+            logger.info(f'Submitted managed job {self._job_id} '
+                        f'(task: {task_id}, name: {task.name!r}); '
+                        f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

-
+        logger.info('Started monitoring.')

         # Only do the initial cluster launch if not resuming from a controller
         # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +357,7 @@ class JobsController:
             remote_job_submitted_at = await self._strategy_executor.launch()

             launch_time = time.time() - launch_start
-
+            logger.info(f'Cluster launch completed in {launch_time:.2f}s')
             assert remote_job_submitted_at is not None, remote_job_submitted_at
         if self._pool is None:
             job_id_on_pool_cluster = None
@@ -367,16 +370,16 @@ class JobsController:
             # Check if we have been cancelled here, in the case where a user
             # quickly cancels the job we want to gracefully handle it here,
             # otherwise we will end up in the FAILED_CONTROLLER state.
-
-
-
+            logger.info(f'Cluster name is None for job {self._job_id}, '
+                        f'task {task_id}. Checking if we have been '
+                        'cancelled.')
             status = await (managed_job_state.get_job_status_with_task_id_async(
                 job_id=self._job_id, task_id=task_id))
-
-
+            logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+                         f'{status}')
             if status == managed_job_state.ManagedJobStatus.CANCELLED:
-
-
+                logger.info(f'Job {self._job_id}, task {task_id} has '
+                            'been quickly cancelled.')
                 raise asyncio.CancelledError()
         assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +420,7 @@ class JobsController:

         if prev_status is not None:
             if prev_status.is_terminal():
-
+                logger.info(
                     f'Task {task_id} already in terminal state: '
                     f'{prev_status}')
                 return (prev_status ==
@@ -427,9 +430,8 @@ class JobsController:
                 # If the controller is down when cancelling the job,
                 # we re-raise the error to run the `_cleanup` function
                 # again to clean up any remaining resources.
-
-
-                            're-raising cancellation')
+                logger.info(f'Task {task_id} was being cancelled, '
+                            're-raising cancellation')
                 raise asyncio.CancelledError()
             if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
                 force_transit_to_recovering = True
@@ -443,10 +445,9 @@ class JobsController:
             try:
                 await backend_utils.async_check_network_connection()
             except exceptions.NetworkError:
-
-
-
-                            'seconds.')
+                logger.info('Network is not available. Retrying again in '
+                            f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+                            'seconds.')
                 continue

             # NOTE: we do not check cluster status first because race condition
@@ -461,23 +462,22 @@ class JobsController:
                     self._backend,
                     cluster_name,
                     job_id=job_id_on_pool_cluster,
-                    job_logger=self._logger,
                 )
             except exceptions.FetchClusterInfoError as fetch_e:
-
+                logger.info(
                     'Failed to fetch the job status. Start recovery.\n'
                     f'Exception: {common_utils.format_exception(fetch_e)}\n'
                     f'Traceback: {traceback.format_exc()}')

             if job_status == job_lib.JobStatus.SUCCEEDED:
-
-
+                logger.info(f'Task {task_id} succeeded! '
+                            'Getting end time and cleaning up')
                 try:
                     success_end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
-
+                    logger.warning(
                         f'Failed to get job end time: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -490,7 +490,7 @@ class JobsController:
                     task_id,
                     end_time=success_end_time,
                     callback_func=callback_func)
-
+                logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
@@ -511,7 +511,7 @@ class JobsController:
                         job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
-
+                    logger.warning(
                         f'Failed to download and stream logs: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -521,10 +521,10 @@ class JobsController:

                 task_total_time = time.time() - task_start_time
                 monitoring_time = time.time() - monitoring_start_time
-
-
-
-
+                logger.info(f'Task {task_id} completed successfully in '
+                            f'{task_total_time:.2f}s '
+                            f'(monitoring time: {monitoring_time:.2f}s, '
+                            f'status checks: {status_check_count})')
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +560,7 @@ class JobsController:
                 # code).
                 cluster_status_str = ('' if cluster_status is None else
                                       f' (status: {cluster_status.value})')
-
+                logger.info(
                     f'Cluster is preempted or failed{cluster_status_str}. '
                     'Recovering...')
             else:
@@ -571,12 +571,12 @@ class JobsController:
                         in job_lib.JobStatus.user_code_failure_states() or
                         job_status == job_lib.JobStatus.FAILED_DRIVER):
                     # The user code has probably crashed, fail immediately.
-
+                    logger.info(
                         f'Task {task_id} failed with status: {job_status}')
                     end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
-
+                    logger.info(
                         f'The user job failed ({job_status}). Please check the '
                         'logs below.\n'
                         f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +611,7 @@ class JobsController:
                     if should_restart_on_failure:
                         max_restarts = (
                             self._strategy_executor.max_restarts_on_errors)
-
+                        logger.info(
                             f'User program crashed '
                             f'({managed_job_status.value}). '
                             f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +619,7 @@ class JobsController:
                             f'[{self._strategy_executor.restart_cnt_on_failure}'
                             f'/{max_restarts}]')
                     else:
-
+                        logger.info(
                             f'Task {task_id} failed and will not be retried')
                         await managed_job_state.set_failed_async(
                             self._job_id,
@@ -632,7 +632,7 @@ class JobsController:
                 elif job_status is not None:
                     # Either the job is cancelled (should not happen) or in some
                     # unknown new state that we do not handle.
-
+                    logger.error(f'Unknown job status: {job_status}')
                     failure_reason = (
                         f'Unknown job status {job_status}. To see the details, '
                         f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +649,9 @@ class JobsController:
                 # job status. Try to recover the job (will not restart the
                 # cluster, if the cluster is healthy).
                 assert job_status is None, job_status
-
-
-
-                            '(the cluster will not be restarted).')
+                logger.info('Failed to fetch the job status while the '
+                            'cluster is healthy. Try to recover the job '
+                            '(the cluster will not be restarted).')
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources
@@ -671,15 +670,14 @@ class JobsController:
                     # Some spot resource (e.g., Spot TPU VM) may need to be
                     # cleaned up after preemption, as running launch again on
                     # those clusters again may fail.
-
-
-                                '...')
+                    logger.info('Cleaning up the preempted or failed cluster'
+                                '...')
                     await self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
-
-
+            logger.info(f'Starting recovery for task {task_id}, '
+                        f'it is currently {job_status}')
             await managed_job_state.set_recovering_async(
                 job_id=self._job_id,
                 task_id=task_id,
@@ -701,7 +699,7 @@ class JobsController:

     async def run(self):
         """Run controller logic and handle exceptions."""
-
+        logger.info(f'Starting JobsController run for job {self._job_id}')
         task_id = 0
         cancelled = False

@@ -709,39 +707,36 @@ class JobsController:
             succeeded = True
             # We support chain DAGs only for now.
             for task_id, task in enumerate(self._dag.tasks):
-
+                logger.info(
                     f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
                     f'{task.name}')
                 task_start = time.time()
                 succeeded = await self._run_one_task(task_id, task)
                 task_time = time.time() - task_start
-
-
-                            f'with success={succeeded}')
+                logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+                            f'with success={succeeded}')

                 if not succeeded:
-
-                        f'Task {task_id} failed, stopping execution')
+                    logger.info(f'Task {task_id} failed, stopping execution')
                     break

         except exceptions.ProvisionPrechecksError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-
+            logger.error(f'Provision prechecks failed for task {task_id}')
             failure_reason = ('; '.join(
                 common_utils.format_exception(reason, use_bracket=True)
                 for reason in e.reasons))
-
+            logger.error(failure_reason)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
                 failure_reason)
         except exceptions.ManagedJobReachedMaxRetriesError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-
-                f'Managed job reached max retries for task {task_id}')
+            logger.error(f'Managed job reached max retries for task {task_id}')
             failure_reason = common_utils.format_exception(e)
-
+            logger.error(failure_reason)
             # The managed job should be marked as FAILED_NO_RESOURCE, as the
             # managed job may be able to launch next time.
             await self._update_failed_task_state(
@@ -753,13 +748,13 @@ class JobsController:
             cancelled = True
             raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-
+            logger.error(
                 f'Unexpected error in JobsController run for task {task_id}')
             with ux_utils.enable_traceback():
-
+                logger.error(traceback.format_exc())
             msg = ('Unexpected error occurred: ' +
                    common_utils.format_exception(e, use_bracket=True))
-
+            logger.error(msg)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
                 msg)
@@ -783,8 +778,8 @@ class JobsController:
             failure_type: managed_job_state.ManagedJobStatus,
             failure_reason: str):
         """Update the state of the failed task."""
-
-
+        logger.info(f'Updating failed task state: task_id={task_id}, '
+                    f'failure_type={failure_type}')
         await managed_job_state.set_failed_async(
             self._job_id,
             task_id=task_id,
@@ -796,10 +791,14 @@ class JobsController:
                 task=self._dag.tasks[task_id]))


-class
-    """
+class ControllerManager:
+    """Main loop for a job controller process.
+
+    Many jobs will be handled by this, each by a single JobController.
+    """

-    def __init__(self) -> None:
+    def __init__(self, controller_uuid: str) -> None:
+        self._controller_uuid = controller_uuid
         # Global state for active jobs
         self.job_tasks: Dict[int, asyncio.Task] = {}
         self.starting: Set[int] = set()
@@ -813,11 +812,9 @@ class Controller:
         # launch).
         self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

-
-
-
-                       job_logger: logging.Logger,
-                       pool: Optional[str] = None):
+        self._pid = os.getpid()
+
+    async def _cleanup(self, job_id: int, pool: Optional[str] = None):
         """Clean up the cluster(s) and storages.

         (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -842,14 +839,13 @@ class Controller:
                 cluster_name = (
                     managed_job_utils.generate_managed_job_cluster_name(
                         task.name, job_id))
-                managed_job_utils.terminate_cluster(cluster_name
-                                                    _logger=job_logger)
+                managed_job_utils.terminate_cluster(cluster_name)
                 status = core.status(cluster_names=[cluster_name],
                                      all_users=True)
                 assert (len(status) == 0 or
                         status[0]['status'] == sky.ClusterStatus.STOPPED), (
                             f'{cluster_name} is not down: {status}')
-
+                logger.info(f'{cluster_name} is down')
             else:
                 cluster_name, job_id_on_pool_cluster = (
                     managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +856,7 @@ class Controller:
                     _try_cancel_if_cluster_is_init=True)
             except Exception as e:  # pylint: disable=broad-except
                 error = e
-
+                logger.warning(
                     f'Failed to terminate cluster {cluster_name}: {e}')
                 # we continue to try cleaning up whatever else we can.
         # Clean up Storages with persistent=False.
@@ -874,7 +870,7 @@ class Controller:
                 for storage in task.storage_mounts.values():
                     storage.construct()
             except (exceptions.StorageSpecError, exceptions.StorageError) as e:
-
+                logger.warning(
                     f'Failed to construct storage object for teardown: {e}\n'
                     'This may happen because storage construction already '
                     'failed during launch, storage was deleted externally, '
@@ -884,7 +880,7 @@ class Controller:
                 backend.teardown_ephemeral_storage(task)
             except Exception as e:  # pylint: disable=broad-except
                 error = e
-
+                logger.warning(f'Failed to teardown ephemeral storage: {e}')
                 # we continue to try cleaning up whatever else we can.

         # Clean up any files mounted from the local disk, such as two-hop
@@ -902,13 +898,13 @@ class Controller:
                 else:
                     os.remove(path)
             except Exception as e:  # pylint: disable=broad-except
-
+                logger.warning(
                     f'Failed to clean up file mount {file_mount}: {e}')

         if error is not None:
             raise error

-        dag
+        dag = _get_dag(job_id)
         error = None
         for task in dag.tasks:
             # most things in this function are blocking
@@ -924,58 +920,52 @@ class Controller:

     # Use context.contextual to enable per-job output redirection and env var
     # isolation.
-    @context.
+    @context.contextual_async
     async def run_job_loop(self,
                            job_id: int,
-                           dag_yaml: str,
-                           job_logger: logging.Logger,
                            log_file: str,
-                           env_file_path: Optional[str] = None,
                            pool: Optional[str] = None):
         """Background task that runs the job loop."""
         ctx = context.get()
         assert ctx is not None, 'Context is not initialized'
         ctx.redirect_log(pathlib.Path(log_file))

-
-
-
-
-
-            job_logger.info(f'Loading environment from {env_file_path}: '
-                            f'{list(env_vars.keys())}')
+        logger.info('Starting job loop for %s', job_id)
+        logger.info(' log_file=%s', log_file)
+        logger.info(' pool=%s', pool)
+        logger.info(f'From controller {self._controller_uuid}')
+        logger.info(f' pid={self._pid}')

-
+        env_content = file_content_utils.get_job_env_content(job_id)
+        if env_content:
+            try:
+                env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
+                logger.info('Loading %d environment variables for job %s',
+                            len(env_vars), job_id)
                 if ctx is not None:
                     for key, value in env_vars.items():
                         if value is not None:
                             ctx.override_envs({key: value})
-
-
-                # Reload the skypilot config for this context to make sure
-                # the latest config is used.
+                            logger.debug('Set environment variable: %s=%s', key,
+                                         value)
                     skypilot_config.reload_config()
-                else:
-
-
+                else:  # pragma: no cover - defensive
+                    logger.error('Context is None, cannot set environment '
+                                 'variables')
             except Exception as e:  # pylint: disable=broad-except
-
-
-
-                job_logger.error(f'Environment file not found: {env_file_path}')
+                logger.error(
+                    'Failed to load environment variables for job %s: '
+                    '%s', job_id, e)

         cancelling = False
         try:
-
-
-
-                                        self.starting, self._job_tasks_lock,
-                                        self._starting_signal, pool)
+            controller = JobController(job_id, self.starting,
+                                       self._job_tasks_lock,
+                                       self._starting_signal, pool)

             async with self._job_tasks_lock:
                 if job_id in self.job_tasks:
-
-                        f'Job {job_id} already exists in job_tasks')
+                    logger.error(f'Job {job_id} already exists in job_tasks')
                     raise ValueError(f'Job {job_id} already exists')

             # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
                 self.job_tasks[job_id] = task
             await task
         except asyncio.CancelledError:
-
-            dag
+            logger.info(f'Job {job_id} was cancelled')
+            dag = _get_dag(job_id)
             task_id, _ = await (
                 managed_job_state.get_latest_task_id_status_async(job_id))
             assert task_id is not None, job_id
-
-
+            logger.info(f'Cancelling managed job, job_id: {job_id}, '
+                        f'task_id: {task_id}')
             await managed_job_state.set_cancelling_async(
                 job_id=job_id,
                 callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
             cancelling = True
             raise
         except Exception as e:
-
-
+            logger.error(f'Unexpected error in job loop for {job_id}: '
+                         f'{common_utils.format_exception(e)}')
             raise
         finally:
             try:
-                await self._cleanup(job_id,
-
-                                    job_logger=job_logger,
-                                    pool=pool)
-                job_logger.info(
+                await self._cleanup(job_id, pool=pool)
+                logger.info(
                     f'Cluster of managed job {job_id} has been cleaned up.')
             except Exception as e:  # pylint: disable=broad-except
                 failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
             # The job can be non-terminal if the controller exited abnormally,
             # e.g. failed to launch cluster after reaching the MAX_RETRY.
             if not job_status.is_terminal():
-
+                logger.info(f'Previous job status: {job_status.value}')
                 await managed_job_state.set_failed_async(
                     job_id,
                     task_id=None,
@@ -1069,48 +1056,25 @@ class Controller:
     async def start_job(
         self,
         job_id: int,
-        dag_yaml: str,
-        env_file_path: Optional[str] = None,
         pool: Optional[str] = None,
     ):
         """Start a new job.

         Args:
             job_id: The ID of the job to start.
-            dag_yaml: Path to the YAML file containing the DAG definition.
-            env_file_path: Optional path to environment file for the job.
         """
-        # Create
+        # Create log file path for job output redirection
        log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
        os.makedirs(log_dir, exist_ok=True)
        log_file = os.path.join(log_dir, f'{job_id}.log')

-
-        job_logger.setLevel(logging.DEBUG)
-
-        # Create file handler
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setLevel(logging.DEBUG)
-
-        # Use Sky's standard formatter
-        file_handler.setFormatter(sky_logging.FORMATTER)
-
-        # Add the handler to the logger
-        job_logger.addHandler(file_handler)
-
-        # Prevent log propagation to avoid duplicate logs
-        job_logger.propagate = False
-
-        job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
-                        f'env_file_path={env_file_path}')
+        logger.info(f'Starting job {job_id} with log_file={log_file}')

        async with self._job_tasks_lock:
            self.starting.add(job_id)
-        await create_background_task(
-            self.run_job_loop(job_id, dag_yaml, job_logger, log_file,
-                              env_file_path, pool))
+        await create_background_task(self.run_job_loop(job_id, log_file, pool))

-
+        logger.info(f'Job {job_id} started successfully')

     async def cancel_job(self):
         """Cancel an existing job."""
@@ -1161,6 +1125,7 @@ class Controller:
                     scheduler.get_number_of_controllers()))

             if len(running_tasks) >= max_jobs:
+                logger.info('Too many jobs running, waiting for 60 seconds')
                 await asyncio.sleep(60)
                 continue

@@ -1174,12 +1139,12 @@ class Controller:
                 continue

             if waiting_job is None:
+                logger.info('No waiting job, waiting for 10 seconds')
                 await asyncio.sleep(10)
                 continue

+            logger.info(f'Claiming job {waiting_job["job_id"]}')
             job_id = waiting_job['job_id']
-            dag_yaml_path = waiting_job['dag_yaml_path']
-            env_file_path = waiting_job.get('env_file_path')
             pool = waiting_job.get('pool', None)

             cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
@@ -1199,13 +1164,15 @@ class Controller:
                         job_id=job_id, task_id=None, task=None))
                 continue

-            await self.start_job(job_id,
+            await self.start_job(job_id, pool)
+

+async def main(controller_uuid: str):
+    logger.info(f'Starting controller {controller_uuid}')

-async def main():
     context_utils.hijack_sys_attrs()

-    controller =
+    controller = ControllerManager(controller_uuid)

     # Will happen multiple times, who cares though
     os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
@@ -1214,6 +1181,8 @@ async def main():
     soft = None
     try:
         soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+        logger.info(f'Increasing soft limit to {hard}')
         resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
     except OSError as e:
         logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1222,7 +1191,10 @@ async def main():
     # Will loop forever, do it in the background
     cancel_job_task = asyncio.create_task(controller.cancel_job())
     monitor_loop_task = asyncio.create_task(controller.monitor_loop())
-
+    # Run the garbage collector in a dedicated daemon thread to avoid affecting
+    # the main event loop.
+    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
+    gc_thread.start()
     try:
         await asyncio.gather(cancel_job_task, monitor_loop_task)
     except Exception as e:  # pylint: disable=broad-except
@@ -1231,4 +1203,4 @@ async def main():


 if __name__ == '__main__':
-    asyncio.run(main())
+    asyncio.run(main(sys.argv[1]))