skypilot-nightly 1.0.0.dev20251009-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -1,16 +1,17 @@
 """Controller: handles scheduling and the life cycle of a managed job.
 """
 import asyncio
-import logging
+import io
 import os
 import pathlib
 import resource
 import shutil
 import sys
+import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set, Tuple
+from typing import Dict, Optional, Set

 import dotenv

@@ -23,6 +24,8 @@ from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
 from sky.jobs import constants as jobs_constants
+from sky.jobs import file_content_utils
+from sky.jobs import log_gc
 from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -30,6 +33,7 @@ from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import context
@@ -62,17 +66,26 @@ async def create_background_task(coro: typing.Coroutine) -> None:
     task.add_done_callback(_background_tasks.discard)


-def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
-    dag = dag_utils.load_chain_dag_from_yaml(dag_yaml)
-    dag_name = dag.name
-    assert dag_name is not None, dag
-    return dag, dag_name
+# Make sure to limit the size as we don't want to cache too many DAGs in memory.
+@annotations.lru_cache(scope='global', maxsize=50)
+def _get_dag(job_id: int) -> 'sky.Dag':
+    dag_content = file_content_utils.get_job_dag_content(job_id)
+    if dag_content is None:
+        raise RuntimeError('Managed job DAG YAML content is unavailable for '
+                           f'job {job_id}. This can happen if the job was '
+                           'submitted before file migration completed or if '
+                           'the submission failed to persist the DAG. Please '
+                           're-submit the job.')

+    dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
+    assert dag.name is not None, dag
+    return dag

-class JobsController:
+
+class JobController:
     """Controls the lifecycle of a single managed job.

-    This controller executes a chain DAG defined in ``dag_yaml`` by:
+    This controller executes the chain DAG recorded for the job by:
     - Loading the DAG and preparing per-task environment variables so each task
       has a stable global job identifier across recoveries.
     - Launching the task on the configured backend (``CloudVmRayBackend``),
@@ -92,10 +105,10 @@ class JobsController:

     Key attributes:
     - ``_job_id``: Integer identifier of this managed job.
-    - ``_dag_yaml`` / ``_dag`` / ``_dag_name``: The job definition and metadata.
+    - ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
+      database-backed job YAML.
     - ``_backend``: Backend used to launch and manage clusters.
     - ``_pool``: Optional pool name if using a cluster pool.
-    - ``_logger``: Job-scoped logger for progress and diagnostics.
     - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
       coordination primitives. ``starting_lock`` must be used for accessing
       ``starting_signal`` and ``starting``
@@ -106,8 +119,6 @@ class JobsController:
     def __init__(
         self,
         job_id: int,
-        dag_yaml: str,
-        job_logger: logging.Logger,
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
@@ -117,8 +128,6 @@ class JobsController:

         Args:
             job_id: Integer ID of the managed job.
-            dag_yaml: Path to the YAML file containing the chain DAG to run.
-            job_logger: Logger instance dedicated to this job.
             starting: Shared set of job IDs currently in the STARTING phase,
                 used to limit concurrent launches.
             starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +143,12 @@ class JobsController:
         self.starting_lock = starting_lock
         self.starting_signal = starting_signal

-        self._logger = job_logger
-        self._logger.info(f'Initializing JobsController for job_id={job_id}, '
-                          f'dag_yaml={dag_yaml}')
+        logger.info('Initializing JobsController for job_id=%s', job_id)

         self._job_id = job_id
-        self._dag_yaml = dag_yaml
-        self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
-        self._logger.info(f'Loaded DAG: {self._dag}')
+        self._dag = _get_dag(job_id)
+        self._dag_name = self._dag.name
+        logger.info(f'Loaded DAG: {self._dag}')

         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
         self._pool = pool
@@ -191,8 +198,8 @@ class JobsController:
         preemptions or ssh disconnection during the streaming.
         """
         if handle is None:
-            self._logger.info(f'Cluster for job {self._job_id} is not found. '
-                              'Skipping downloading and streaming the logs.')
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
             return

         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +217,11 @@ class JobsController:
                 managed_job_state.set_local_log_file(self._job_id, task_id,
                                                      log_file)
             else:
-                self._logger.warning(
+                logger.warning(
                     f'No log file was downloaded for job {self._job_id}, '
                     f'task {task_id}')

-        self._logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
+        logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
         if cluster_name is None:
@@ -259,7 +266,7 @@ class JobsController:
         Other exceptions may be raised depending on the backend.
         """
         task_start_time = time.time()
-        self._logger.info(
+        logger.info(
             f'Starting task {task_id} ({task.name}) for job {self._job_id}')

         latest_task_id, last_task_prev_status = (
@@ -271,22 +278,20 @@ class JobsController:
                 managed_job_state.ManagedJobStatus.PENDING):
             assert latest_task_id >= task_id, (latest_task_id, task_id)
             if latest_task_id > task_id:
-                self._logger.info(f'Task {task_id} ({task.name}) has already '
-                                  'been executed. Skipping...')
+                logger.info(f'Task {task_id} ({task.name}) has already '
+                            'been executed. Skipping...')
                 return True
             if latest_task_id == task_id:
                 # Start recovery.
                 is_resume = True
-                self._logger.info(
-                    f'Resuming task {task_id} from previous execution')
+                logger.info(f'Resuming task {task_id} from previous execution')

         callback_func = managed_job_utils.event_callback_func(
             job_id=self._job_id, task_id=task_id, task=task)

         if task.run is None:
-            self._logger.info(
-                f'Skip running task {task_id} ({task.name}) due to its '
-                'run commands being empty.')
+            logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+                        'run commands being empty.')
             # Call set_started first to initialize columns in the state table,
             # including start_at and last_recovery_at to avoid issues for
             # uninitialized columns.
@@ -300,8 +305,7 @@ class JobsController:
                 task_id=task_id,
                 end_time=time.time(),
                 callback_func=callback_func)
-            self._logger.info(
-                f'Empty task {task_id} marked as succeeded immediately')
+            logger.info(f'Empty task {task_id} marked as succeeded immediately')
             return True

         usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +318,7 @@ class JobsController:
             task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
             cluster_name, self._backend, task, self._job_id, task_id,
-            self._logger, self._pool, self.starting, self.starting_lock,
-            self.starting_signal)
+            self._pool, self.starting, self.starting_lock, self.starting_signal)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -336,11 +339,11 @@ class JobsController:
                         self._strategy_executor.max_restarts_on_errors
                 },
                 callback_func=callback_func)
-            self._logger.info(f'Submitted managed job {self._job_id} '
-                              f'(task: {task_id}, name: {task.name!r}); '
-                              f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
+            logger.info(f'Submitted managed job {self._job_id} '
+                        f'(task: {task_id}, name: {task.name!r}); '
+                        f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

-        self._logger.info('Started monitoring.')
+        logger.info('Started monitoring.')

         # Only do the initial cluster launch if not resuming from a controller
         # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +357,7 @@ class JobsController:
             remote_job_submitted_at = await self._strategy_executor.launch()

             launch_time = time.time() - launch_start
-            self._logger.info(f'Cluster launch completed in {launch_time:.2f}s')
+            logger.info(f'Cluster launch completed in {launch_time:.2f}s')
         assert remote_job_submitted_at is not None, remote_job_submitted_at
         if self._pool is None:
             job_id_on_pool_cluster = None
@@ -367,16 +370,16 @@ class JobsController:
             # Check if we have been cancelled here, in the case where a user
             # quickly cancels the job we want to gracefully handle it here,
             # otherwise we will end up in the FAILED_CONTROLLER state.
-            self._logger.info(f'Cluster name is None for job {self._job_id}, '
-                              f'task {task_id}. Checking if we have been '
-                              'cancelled.')
+            logger.info(f'Cluster name is None for job {self._job_id}, '
+                        f'task {task_id}. Checking if we have been '
+                        'cancelled.')
             status = await (managed_job_state.get_job_status_with_task_id_async(
                 job_id=self._job_id, task_id=task_id))
-            self._logger.debug(f'Status for job {self._job_id}, task {task_id}:'
-                               f'{status}')
+            logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+                         f'{status}')
             if status == managed_job_state.ManagedJobStatus.CANCELLED:
-                self._logger.info(f'Job {self._job_id}, task {task_id} has '
-                                  'been quickly cancelled.')
+                logger.info(f'Job {self._job_id}, task {task_id} has '
+                            'been quickly cancelled.')
                 raise asyncio.CancelledError()
         assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +420,7 @@ class JobsController:

         if prev_status is not None:
             if prev_status.is_terminal():
-                self._logger.info(
+                logger.info(
                     f'Task {task_id} already in terminal state: '
                     f'{prev_status}')
                 return (prev_status ==
@@ -427,9 +430,8 @@ class JobsController:
                 # If the controller is down when cancelling the job,
                 # we re-raise the error to run the `_cleanup` function
                 # again to clean up any remaining resources.
-                self._logger.info(
-                    f'Task {task_id} was being cancelled, '
-                    're-raising cancellation')
+                logger.info(f'Task {task_id} was being cancelled, '
+                            're-raising cancellation')
                 raise asyncio.CancelledError()
             if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
                 force_transit_to_recovering = True
@@ -443,10 +445,9 @@ class JobsController:
             try:
                 await backend_utils.async_check_network_connection()
             except exceptions.NetworkError:
-                self._logger.info(
-                    'Network is not available. Retrying again in '
-                    f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
-                    'seconds.')
+                logger.info('Network is not available. Retrying again in '
+                            f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+                            'seconds.')
                 continue

             # NOTE: we do not check cluster status first because race condition
@@ -461,23 +462,22 @@ class JobsController:
                     self._backend,
                     cluster_name,
                     job_id=job_id_on_pool_cluster,
-                    job_logger=self._logger,
                 )
             except exceptions.FetchClusterInfoError as fetch_e:
-                self._logger.info(
+                logger.info(
                     'Failed to fetch the job status. Start recovery.\n'
                     f'Exception: {common_utils.format_exception(fetch_e)}\n'
                     f'Traceback: {traceback.format_exc()}')

             if job_status == job_lib.JobStatus.SUCCEEDED:
-                self._logger.info(f'Task {task_id} succeeded! '
-                                  'Getting end time and cleaning up')
+                logger.info(f'Task {task_id} succeeded! '
+                            'Getting end time and cleaning up')
                 try:
                     success_end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
-                    self._logger.warning(
+                    logger.warning(
                         f'Failed to get job end time: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -490,7 +490,7 @@ class JobsController:
                     task_id,
                     end_time=success_end_time,
                     callback_func=callback_func)
-                self._logger.info(
+                logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
@@ -511,7 +511,7 @@ class JobsController:
                         job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
-                    self._logger.warning(
+                    logger.warning(
                         f'Failed to download and stream logs: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -521,10 +521,10 @@ class JobsController:

                 task_total_time = time.time() - task_start_time
                 monitoring_time = time.time() - monitoring_start_time
-                self._logger.info(f'Task {task_id} completed successfully in '
-                                  f'{task_total_time:.2f}s '
-                                  f'(monitoring time: {monitoring_time:.2f}s, '
-                                  f'status checks: {status_check_count})')
+                logger.info(f'Task {task_id} completed successfully in '
+                            f'{task_total_time:.2f}s '
+                            f'(monitoring time: {monitoring_time:.2f}s, '
+                            f'status checks: {status_check_count})')
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +560,7 @@ class JobsController:
                 # code).
                 cluster_status_str = ('' if cluster_status is None else
                                       f' (status: {cluster_status.value})')
-                self._logger.info(
+                logger.info(
                     f'Cluster is preempted or failed{cluster_status_str}. '
                     'Recovering...')
             else:
@@ -571,12 +571,12 @@ class JobsController:
                         in job_lib.JobStatus.user_code_failure_states() or
                         job_status == job_lib.JobStatus.FAILED_DRIVER):
                     # The user code has probably crashed, fail immediately.
-                    self._logger.info(
+                    logger.info(
                         f'Task {task_id} failed with status: {job_status}')
                     end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
-                    self._logger.info(
+                    logger.info(
                         f'The user job failed ({job_status}). Please check the '
                         'logs below.\n'
                         f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +611,7 @@ class JobsController:
                     if should_restart_on_failure:
                         max_restarts = (
                             self._strategy_executor.max_restarts_on_errors)
-                        self._logger.info(
+                        logger.info(
                             f'User program crashed '
                             f'({managed_job_status.value}). '
                             f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +619,7 @@ class JobsController:
                             f'[{self._strategy_executor.restart_cnt_on_failure}'
                             f'/{max_restarts}]')
                     else:
-                        self._logger.info(
+                        logger.info(
                             f'Task {task_id} failed and will not be retried')
                         await managed_job_state.set_failed_async(
                             self._job_id,
@@ -632,7 +632,7 @@ class JobsController:
                 elif job_status is not None:
                     # Either the job is cancelled (should not happen) or in some
                     # unknown new state that we do not handle.
-                    self._logger.error(f'Unknown job status: {job_status}')
+                    logger.error(f'Unknown job status: {job_status}')
                     failure_reason = (
                         f'Unknown job status {job_status}. To see the details, '
                         f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +649,9 @@ class JobsController:
                     # job status. Try to recover the job (will not restart the
                     # cluster, if the cluster is healthy).
                     assert job_status is None, job_status
-                    self._logger.info(
-                        'Failed to fetch the job status while the '
-                        'cluster is healthy. Try to recover the job '
-                        '(the cluster will not be restarted).')
+                    logger.info('Failed to fetch the job status while the '
+                                'cluster is healthy. Try to recover the job '
+                                '(the cluster will not be restarted).')
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources
@@ -671,15 +670,14 @@ class JobsController:
                     # Some spot resource (e.g., Spot TPU VM) may need to be
                     # cleaned up after preemption, as running launch again on
                     # those clusters again may fail.
-                    self._logger.info(
-                        'Cleaning up the preempted or failed cluster'
-                        '...')
+                    logger.info('Cleaning up the preempted or failed cluster'
+                                '...')
                     await self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
-            self._logger.info(f'Starting recovery for task {task_id}, '
-                              f'it is currently {job_status}')
+            logger.info(f'Starting recovery for task {task_id}, '
+                        f'it is currently {job_status}')
             await managed_job_state.set_recovering_async(
                 job_id=self._job_id,
                 task_id=task_id,
@@ -701,7 +699,7 @@ class JobsController:

     async def run(self):
         """Run controller logic and handle exceptions."""
-        self._logger.info(f'Starting JobsController run for job {self._job_id}')
+        logger.info(f'Starting JobsController run for job {self._job_id}')
         task_id = 0
         cancelled = False

@@ -709,39 +707,36 @@ class JobsController:
             succeeded = True
             # We support chain DAGs only for now.
             for task_id, task in enumerate(self._dag.tasks):
-                self._logger.info(
+                logger.info(
                     f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
                     f'{task.name}')
                 task_start = time.time()
                 succeeded = await self._run_one_task(task_id, task)
                 task_time = time.time() - task_start
-                self._logger.info(
-                    f'Task {task_id} completed in {task_time:.2f}s '
-                    f'with success={succeeded}')
+                logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+                            f'with success={succeeded}')

                 if not succeeded:
-                    self._logger.info(
-                        f'Task {task_id} failed, stopping execution')
+                    logger.info(f'Task {task_id} failed, stopping execution')
                     break

         except exceptions.ProvisionPrechecksError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-            self._logger.error(f'Provision prechecks failed for task {task_id}')
+            logger.error(f'Provision prechecks failed for task {task_id}')
             failure_reason = ('; '.join(
                 common_utils.format_exception(reason, use_bracket=True)
                 for reason in e.reasons))
-            self._logger.error(failure_reason)
+            logger.error(failure_reason)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
                 failure_reason)
         except exceptions.ManagedJobReachedMaxRetriesError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-            self._logger.error(
-                f'Managed job reached max retries for task {task_id}')
+            logger.error(f'Managed job reached max retries for task {task_id}')
             failure_reason = common_utils.format_exception(e)
-            self._logger.error(failure_reason)
+            logger.error(failure_reason)
             # The managed job should be marked as FAILED_NO_RESOURCE, as the
             # managed job may be able to launch next time.
             await self._update_failed_task_state(
@@ -753,13 +748,13 @@ class JobsController:
             cancelled = True
             raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            self._logger.error(
+            logger.error(
                 f'Unexpected error in JobsController run for task {task_id}')
             with ux_utils.enable_traceback():
-                self._logger.error(traceback.format_exc())
+                logger.error(traceback.format_exc())
             msg = ('Unexpected error occurred: ' +
                    common_utils.format_exception(e, use_bracket=True))
-            self._logger.error(msg)
+            logger.error(msg)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
                 msg)
@@ -783,8 +778,8 @@ class JobsController:
             failure_type: managed_job_state.ManagedJobStatus,
             failure_reason: str):
         """Update the state of the failed task."""
-        self._logger.info(f'Updating failed task state: task_id={task_id}, '
-                          f'failure_type={failure_type}')
+        logger.info(f'Updating failed task state: task_id={task_id}, '
+                    f'failure_type={failure_type}')
         await managed_job_state.set_failed_async(
             self._job_id,
             task_id=task_id,
@@ -796,10 +791,14 @@ class JobsController:
                 task=self._dag.tasks[task_id]))


-class Controller:
-    """Controller for managing jobs."""
+class ControllerManager:
+    """Main loop for a job controller process.
+
+    Many jobs will be handled by this, each by a single JobController.
+    """

-    def __init__(self) -> None:
+    def __init__(self, controller_uuid: str) -> None:
+        self._controller_uuid = controller_uuid
         # Global state for active jobs
         self.job_tasks: Dict[int, asyncio.Task] = {}
         self.starting: Set[int] = set()
@@ -813,11 +812,9 @@ class Controller:
         # launch).
         self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

-    async def _cleanup(self,
-                       job_id: int,
-                       dag_yaml: str,
-                       job_logger: logging.Logger,
-                       pool: Optional[str] = None):
+        self._pid = os.getpid()
+
+    async def _cleanup(self, job_id: int, pool: Optional[str] = None):
         """Clean up the cluster(s) and storages.

         (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -842,14 +839,13 @@ class Controller:
                 cluster_name = (
                     managed_job_utils.generate_managed_job_cluster_name(
                         task.name, job_id))
-                managed_job_utils.terminate_cluster(cluster_name,
-                                                    _logger=job_logger)
+                managed_job_utils.terminate_cluster(cluster_name)
                 status = core.status(cluster_names=[cluster_name],
                                      all_users=True)
                 assert (len(status) == 0 or
                         status[0]['status'] == sky.ClusterStatus.STOPPED), (
                             f'{cluster_name} is not down: {status}')
-                job_logger.info(f'{cluster_name} is down')
+                logger.info(f'{cluster_name} is down')
             else:
                 cluster_name, job_id_on_pool_cluster = (
                     managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +856,7 @@ class Controller:
                     _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             error = e
-            job_logger.warning(
+            logger.warning(
                 f'Failed to terminate cluster {cluster_name}: {e}')
             # we continue to try cleaning up whatever else we can.
         # Clean up Storages with persistent=False.
@@ -874,7 +870,7 @@ class Controller:
                 for storage in task.storage_mounts.values():
                     storage.construct()
             except (exceptions.StorageSpecError, exceptions.StorageError) as e:
-                job_logger.warning(
+                logger.warning(
                     f'Failed to construct storage object for teardown: {e}\n'
                     'This may happen because storage construction already '
                     'failed during launch, storage was deleted externally, '
@@ -884,7 +880,7 @@ class Controller:
                 backend.teardown_ephemeral_storage(task)
             except Exception as e:  # pylint: disable=broad-except
                 error = e
-                job_logger.warning(f'Failed to teardown ephemeral storage: {e}')
+                logger.warning(f'Failed to teardown ephemeral storage: {e}')
                 # we continue to try cleaning up whatever else we can.

             # Clean up any files mounted from the local disk, such as two-hop
@@ -902,13 +898,13 @@ class Controller:
                     else:
                         os.remove(path)
                 except Exception as e:  # pylint: disable=broad-except
-                    job_logger.warning(
+                    logger.warning(
                         f'Failed to clean up file mount {file_mount}: {e}')

             if error is not None:
                 raise error

-        dag, _ = _get_dag_and_name(dag_yaml)
+        dag = _get_dag(job_id)
         error = None
         for task in dag.tasks:
             # most things in this function are blocking
@@ -924,58 +920,52 @@ class Controller:

     # Use context.contextual to enable per-job output redirection and env var
     # isolation.
-    @context.contextual
+    @context.contextual_async
     async def run_job_loop(self,
                            job_id: int,
-                           dag_yaml: str,
-                           job_logger: logging.Logger,
                            log_file: str,
-                           env_file_path: Optional[str] = None,
                            pool: Optional[str] = None):
         """Background task that runs the job loop."""
         ctx = context.get()
         assert ctx is not None, 'Context is not initialized'
         ctx.redirect_log(pathlib.Path(log_file))

-        # Load and apply environment variables from the job's environment file
-        if env_file_path and os.path.exists(env_file_path):
-            try:
-                # Load environment variables from the file
-                env_vars = dotenv.dotenv_values(env_file_path)
-                job_logger.info(f'Loading environment from {env_file_path}: '
-                                f'{list(env_vars.keys())}')
+        logger.info('Starting job loop for %s', job_id)
+        logger.info(' log_file=%s', log_file)
+        logger.info(' pool=%s', pool)
+        logger.info(f'From controller {self._controller_uuid}')
+        logger.info(f' pid={self._pid}')

-                # Apply environment variables to the job's context
+        env_content = file_content_utils.get_job_env_content(job_id)
+        if env_content:
+            try:
+                env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
+                logger.info('Loading %d environment variables for job %s',
+                            len(env_vars), job_id)
                 if ctx is not None:
                     for key, value in env_vars.items():
                         if value is not None:
                             ctx.override_envs({key: value})
-                            job_logger.debug(
-                                f'Set environment variable: {key}={value}')
-                    # Reload the skypilot config for this context to make sure
-                    # the latest config is used.
+                            logger.debug('Set environment variable: %s=%s', key,
                                          value)
                     skypilot_config.reload_config()
-                else:
-                    job_logger.error(
-                        'Context is None, cannot set environment variables')
+                else:  # pragma: no cover - defensive
+                    logger.error('Context is None, cannot set environment '
                                  'variables')
             except Exception as e:  # pylint: disable=broad-except
-                job_logger.error(
-                    f'Failed to load environment file {env_file_path}: {e}')
-        elif env_file_path:
-            job_logger.error(f'Environment file not found: {env_file_path}')
+                logger.error(
                     'Failed to load environment variables for job %s: '
                     '%s', job_id, e)

         cancelling = False
         try:
-            job_logger.info(f'Starting job loop for {job_id}')
-
-            controller = JobsController(job_id, dag_yaml, job_logger,
-                                        self.starting, self._job_tasks_lock,
-                                        self._starting_signal, pool)
+            controller = JobController(job_id, self.starting,
                                        self._job_tasks_lock,
                                        self._starting_signal, pool)

             async with self._job_tasks_lock:
                 if job_id in self.job_tasks:
-                    job_logger.error(
-                        f'Job {job_id} already exists in job_tasks')
+                    logger.error(f'Job {job_id} already exists in job_tasks')
                     raise ValueError(f'Job {job_id} already exists')

             # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
                 self.job_tasks[job_id] = task
             await task
         except asyncio.CancelledError:
-            job_logger.info(f'Job {job_id} was cancelled')
-            dag, _ = _get_dag_and_name(dag_yaml)
+            logger.info(f'Job {job_id} was cancelled')
+            dag = _get_dag(job_id)
             task_id, _ = await (
                 managed_job_state.get_latest_task_id_status_async(job_id))
             assert task_id is not None, job_id
-            job_logger.info(f'Cancelling managed job, job_id: {job_id}, '
-                            f'task_id: {task_id}')
+            logger.info(f'Cancelling managed job, job_id: {job_id}, '
+                        f'task_id: {task_id}')
             await managed_job_state.set_cancelling_async(
                 job_id=job_id,
                 callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
             cancelling = True
             raise
         except Exception as e:
-            job_logger.error(f'Unexpected error in job loop for {job_id}: '
-                             f'{common_utils.format_exception(e)}')
+            logger.error(f'Unexpected error in job loop for {job_id}: '
+                         f'{common_utils.format_exception(e)}')
             raise
         finally:
            try:
-                await self._cleanup(job_id,
-                                    dag_yaml=dag_yaml,
-                                    job_logger=job_logger,
-                                    pool=pool)
-                job_logger.info(
+                await self._cleanup(job_id, pool=pool)
+                logger.info(
                     f'Cluster of managed job {job_id} has been cleaned up.')
             except Exception as e:  # pylint: disable=broad-except
                 failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
             # The job can be non-terminal if the controller exited abnormally,
             # e.g. failed to launch cluster after reaching the MAX_RETRY.
             if not job_status.is_terminal():
-                job_logger.info(f'Previous job status: {job_status.value}')
+                logger.info(f'Previous job status: {job_status.value}')
                 await managed_job_state.set_failed_async(
                     job_id,
                     task_id=None,
@@ -1069,48 +1056,25 @@ class Controller:
     async def start_job(
         self,
         job_id: int,
-        dag_yaml: str,
-        env_file_path: Optional[str] = None,
         pool: Optional[str] = None,
     ):
         """Start a new job.

         Args:
             job_id: The ID of the job to start.
-            dag_yaml: Path to the YAML file containing the DAG definition.
-            env_file_path: Optional path to environment file for the job.
         """
-        # Create a job-specific logger
+        # Create log file path for job output redirection
         log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
         os.makedirs(log_dir, exist_ok=True)
        log_file = os.path.join(log_dir, f'{job_id}.log')

-        job_logger = logging.getLogger(f'sky.jobs.{job_id}')
-        job_logger.setLevel(logging.DEBUG)
-
-        # Create file handler
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setLevel(logging.DEBUG)
-
-        # Use Sky's standard formatter
-        file_handler.setFormatter(sky_logging.FORMATTER)
-
-        # Add the handler to the logger
-        job_logger.addHandler(file_handler)
-
-        # Prevent log propagation to avoid duplicate logs
-        job_logger.propagate = False
-
-        job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
-                        f'env_file_path={env_file_path}')
+        logger.info(f'Starting job {job_id} with log_file={log_file}')

         async with self._job_tasks_lock:
             self.starting.add(job_id)
-        await create_background_task(
-            self.run_job_loop(job_id, dag_yaml, job_logger, log_file,
-                              env_file_path, pool))
+        await create_background_task(self.run_job_loop(job_id, log_file, pool))

-        job_logger.info(f'Job {job_id} started successfully')
+        logger.info(f'Job {job_id} started successfully')

     async def cancel_job(self):
         """Cancel an existing job."""
@@ -1161,6 +1125,7 @@ class Controller:
                     scheduler.get_number_of_controllers()))

             if len(running_tasks) >= max_jobs:
+                logger.info('Too many jobs running, waiting for 60 seconds')
                 await asyncio.sleep(60)
                 continue

@@ -1174,12 +1139,12 @@ class Controller:
                 continue

             if waiting_job is None:
+                logger.info('No waiting job, waiting for 10 seconds')
                 await asyncio.sleep(10)
                 continue

+            logger.info(f'Claiming job {waiting_job["job_id"]}')
             job_id = waiting_job['job_id']
-            dag_yaml_path = waiting_job['dag_yaml_path']
-            env_file_path = waiting_job.get('env_file_path')
             pool = waiting_job.get('pool', None)

             cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
@@ -1199,13 +1164,15 @@ class Controller:
                     job_id=job_id, task_id=None, task=None))
                 continue

-            await self.start_job(job_id, dag_yaml_path, env_file_path, pool)
+            await self.start_job(job_id, pool)
+

+async def main(controller_uuid: str):
+    logger.info(f'Starting controller {controller_uuid}')

-async def main():
     context_utils.hijack_sys_attrs()

-    controller = Controller()
+    controller = ControllerManager(controller_uuid)

     # Will happen multiple times, who cares though
     os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)

@@ -1214,6 +1181,8 @@ async def main():
     soft = None
     try:
         soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+        logger.info(f'Increasing soft limit to {hard}')
         resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
     except OSError as e:
         logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1222,7 +1191,10 @@ async def main():
     # Will loop forever, do it in the background
     cancel_job_task = asyncio.create_task(controller.cancel_job())
     monitor_loop_task = asyncio.create_task(controller.monitor_loop())
-
+    # Run the garbage collector in a dedicated daemon thread to avoid affecting
+    # the main event loop.
+    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
+    gc_thread.start()
     try:
         await asyncio.gather(cancel_job_task, monitor_loop_task)
     except Exception as e:  # pylint: disable=broad-except
@@ -1231,4 +1203,4 @@

 if __name__ == '__main__':
-    asyncio.run(main())
+    asyncio.run(main(sys.argv[1]))
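
For readers tracing the new DAG-loading path in the hunks above: the controller now fetches each job's DAG YAML from database-backed content and memoizes the parsed result, instead of re-reading a YAML file path. The following is an illustrative sketch only (not SkyPilot code): it uses functools.lru_cache as a stand-in for sky.utils.annotations.lru_cache, and a fake in-memory store plus a hypothetical get_job_dag_content() helper in place of sky.jobs.file_content_utils.

# Illustrative sketch of the cached, DB-backed DAG lookup pattern.
import functools
from typing import Dict, Optional

# Stand-in for the per-job DAG YAML content now persisted by the jobs state DB.
_FAKE_DAG_STORE: Dict[int, str] = {42: 'name: demo\ntasks: []\n'}


def get_job_dag_content(job_id: int) -> Optional[str]:
    """Stand-in for file_content_utils.get_job_dag_content()."""
    return _FAKE_DAG_STORE.get(job_id)


@functools.lru_cache(maxsize=50)  # bounded cache, mirroring maxsize=50 above
def get_dag_yaml(job_id: int) -> str:
    """Load and cache the DAG content once per job_id."""
    content = get_job_dag_content(job_id)
    if content is None:
        raise RuntimeError(f'No DAG content recorded for job {job_id}')
    return content


print(get_dag_yaml(42))  # first call reads from the store
print(get_dag_yaml(42))  # second call is served from the LRU cache

The bounded cache keeps memory flat when one controller process drives many jobs, which is the same trade-off the diff calls out in its "limit the size" comment.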