skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/shadeform.py +89 -0
- sky/authentication.py +52 -2
- sky/backends/backend_utils.py +35 -25
- sky/backends/cloud_vm_ray_backend.py +5 -5
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/catalog/shadeform_catalog.py +165 -0
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/shadeform.py +393 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +44 -39
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +25 -3
- sky/server/server.py +9 -3
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +23 -6
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +15 -11
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -1,7 +1,6 @@
 """Controller: handles scheduling and the life cycle of a managed job.
 """
 import asyncio
-import logging
 import os
 import pathlib
 import resource
@@ -95,7 +94,6 @@ class JobsController:
 - ``_dag_yaml`` / ``_dag`` / ``_dag_name``: The job definition and metadata.
 - ``_backend``: Backend used to launch and manage clusters.
 - ``_pool``: Optional pool name if using a cluster pool.
-- ``_logger``: Job-scoped logger for progress and diagnostics.
 - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
 coordination primitives. ``starting_lock`` must be used for accessing
 ``starting_signal`` and ``starting``
@@ -107,7 +105,6 @@ class JobsController:
 self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 starting: Set[int],
 starting_lock: asyncio.Lock,
 starting_signal: asyncio.Condition,
@@ -118,7 +115,6 @@ class JobsController:
 Args:
 job_id: Integer ID of the managed job.
 dag_yaml: Path to the YAML file containing the chain DAG to run.
-job_logger: Logger instance dedicated to this job.
 starting: Shared set of job IDs currently in the STARTING phase,
 used to limit concurrent launches.
 starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +130,13 @@ class JobsController:
 self.starting_lock = starting_lock
 self.starting_signal = starting_signal

-
-
-f'dag_yaml={dag_yaml}')
+logger.info(f'Initializing JobsController for job_id={job_id}, '
+f'dag_yaml={dag_yaml}')

 self._job_id = job_id
 self._dag_yaml = dag_yaml
 self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
-
+logger.info(f'Loaded DAG: {self._dag}')

 self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 self._pool = pool
@@ -191,8 +186,8 @@ class JobsController:
 preemptions or ssh disconnection during the streaming.
 """
 if handle is None:
-
-
+logger.info(f'Cluster for job {self._job_id} is not found. '
+'Skipping downloading and streaming the logs.')
 return

 managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +205,11 @@ class JobsController:
 managed_job_state.set_local_log_file(self._job_id, task_id,
 log_file)
 else:
-
+logger.warning(
 f'No log file was downloaded for job {self._job_id}, '
 f'task {task_id}')

-
+logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

 async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
 if cluster_name is None:
@@ -259,7 +254,7 @@ class JobsController:
 Other exceptions may be raised depending on the backend.
 """
 task_start_time = time.time()
-
+logger.info(
 f'Starting task {task_id} ({task.name}) for job {self._job_id}')

 latest_task_id, last_task_prev_status = (
@@ -271,22 +266,20 @@ class JobsController:
 managed_job_state.ManagedJobStatus.PENDING):
 assert latest_task_id >= task_id, (latest_task_id, task_id)
 if latest_task_id > task_id:
-
-
+logger.info(f'Task {task_id} ({task.name}) has already '
+'been executed. Skipping...')
 return True
 if latest_task_id == task_id:
 # Start recovery.
 is_resume = True
-
-f'Resuming task {task_id} from previous execution')
+logger.info(f'Resuming task {task_id} from previous execution')

 callback_func = managed_job_utils.event_callback_func(
 job_id=self._job_id, task_id=task_id, task=task)

 if task.run is None:
-
-
-'run commands being empty.')
+logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+'run commands being empty.')
 # Call set_started first to initialize columns in the state table,
 # including start_at and last_recovery_at to avoid issues for
 # uninitialized columns.
@@ -300,8 +293,7 @@ class JobsController:
 task_id=task_id,
 end_time=time.time(),
 callback_func=callback_func)
-
-f'Empty task {task_id} marked as succeeded immediately')
+logger.info(f'Empty task {task_id} marked as succeeded immediately')
 return True

 usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +306,7 @@ class JobsController:
 task.name, self._job_id) if self._pool is None else None
 self._strategy_executor = recovery_strategy.StrategyExecutor.make(
 cluster_name, self._backend, task, self._job_id, task_id,
-self.
-self.starting_signal)
+self._pool, self.starting, self.starting_lock, self.starting_signal)
 if not is_resume:
 submitted_at = time.time()
 if task_id == 0:
@@ -336,11 +327,11 @@ class JobsController:
 self._strategy_executor.max_restarts_on_errors
 },
 callback_func=callback_func)
-
-
-
+logger.info(f'Submitted managed job {self._job_id} '
+f'(task: {task_id}, name: {task.name!r}); '
+f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

-
+logger.info('Started monitoring.')

 # Only do the initial cluster launch if not resuming from a controller
 # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +345,7 @@ class JobsController:
 remote_job_submitted_at = await self._strategy_executor.launch()

 launch_time = time.time() - launch_start
-
+logger.info(f'Cluster launch completed in {launch_time:.2f}s')
 assert remote_job_submitted_at is not None, remote_job_submitted_at
 if self._pool is None:
 job_id_on_pool_cluster = None
@@ -367,16 +358,16 @@ class JobsController:
 # Check if we have been cancelled here, in the case where a user
 # quickly cancels the job we want to gracefully handle it here,
 # otherwise we will end up in the FAILED_CONTROLLER state.
-
-
-
+logger.info(f'Cluster name is None for job {self._job_id}, '
+f'task {task_id}. Checking if we have been '
+'cancelled.')
 status = await (managed_job_state.get_job_status_with_task_id_async(
 job_id=self._job_id, task_id=task_id))
-
-
+logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+f'{status}')
 if status == managed_job_state.ManagedJobStatus.CANCELLED:
-
-
+logger.info(f'Job {self._job_id}, task {task_id} has '
+'been quickly cancelled.')
 raise asyncio.CancelledError()
 assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +408,7 @@ class JobsController:

 if prev_status is not None:
 if prev_status.is_terminal():
-
+logger.info(
 f'Task {task_id} already in terminal state: '
 f'{prev_status}')
 return (prev_status ==
@@ -427,9 +418,8 @@ class JobsController:
 # If the controller is down when cancelling the job,
 # we re-raise the error to run the `_cleanup` function
 # again to clean up any remaining resources.
-
-
-'re-raising cancellation')
+logger.info(f'Task {task_id} was being cancelled, '
+'re-raising cancellation')
 raise asyncio.CancelledError()
 if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
 force_transit_to_recovering = True
@@ -443,10 +433,9 @@ class JobsController:
 try:
 await backend_utils.async_check_network_connection()
 except exceptions.NetworkError:
-
-
-
-'seconds.')
+logger.info('Network is not available. Retrying again in '
+f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+'seconds.')
 continue

 # NOTE: we do not check cluster status first because race condition
@@ -461,23 +450,22 @@ class JobsController:
 self._backend,
 cluster_name,
 job_id=job_id_on_pool_cluster,
-job_logger=self._logger,
 )
 except exceptions.FetchClusterInfoError as fetch_e:
-
+logger.info(
 'Failed to fetch the job status. Start recovery.\n'
 f'Exception: {common_utils.format_exception(fetch_e)}\n'
 f'Traceback: {traceback.format_exc()}')

 if job_status == job_lib.JobStatus.SUCCEEDED:
-
-
+logger.info(f'Task {task_id} succeeded! '
+'Getting end time and cleaning up')
 try:
 success_end_time = await context_utils.to_thread(
 managed_job_utils.try_to_get_job_end_time,
 self._backend, cluster_name, job_id_on_pool_cluster)
 except Exception as e: # pylint: disable=broad-except
-
+logger.warning(
 f'Failed to get job end time: '
 f'{common_utils.format_exception(e)}',
 exc_info=True)
@@ -490,7 +478,7 @@ class JobsController:
 task_id,
 end_time=success_end_time,
 callback_func=callback_func)
-
+logger.info(
 f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
 f'Cleaning up the cluster {cluster_name}.')
 try:
@@ -511,7 +499,7 @@ class JobsController:
 job_id_on_pool_cluster)
 except Exception as e: # pylint: disable=broad-except
 # We don't want to crash here, so just log and continue.
-
+logger.warning(
 f'Failed to download and stream logs: '
 f'{common_utils.format_exception(e)}',
 exc_info=True)
@@ -521,10 +509,10 @@ class JobsController:

 task_total_time = time.time() - task_start_time
 monitoring_time = time.time() - monitoring_start_time
-
-
-
-
+logger.info(f'Task {task_id} completed successfully in '
+f'{task_total_time:.2f}s '
+f'(monitoring time: {monitoring_time:.2f}s, '
+f'status checks: {status_check_count})')
 return True

 # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +548,7 @@ class JobsController:
 # code).
 cluster_status_str = ('' if cluster_status is None else
 f' (status: {cluster_status.value})')
-
+logger.info(
 f'Cluster is preempted or failed{cluster_status_str}. '
 'Recovering...')
 else:
@@ -571,12 +559,12 @@ class JobsController:
 in job_lib.JobStatus.user_code_failure_states() or
 job_status == job_lib.JobStatus.FAILED_DRIVER):
 # The user code has probably crashed, fail immediately.
-
+logger.info(
 f'Task {task_id} failed with status: {job_status}')
 end_time = await context_utils.to_thread(
 managed_job_utils.try_to_get_job_end_time,
 self._backend, cluster_name, job_id_on_pool_cluster)
-
+logger.info(
 f'The user job failed ({job_status}). Please check the '
 'logs below.\n'
 f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +599,7 @@ class JobsController:
 if should_restart_on_failure:
 max_restarts = (
 self._strategy_executor.max_restarts_on_errors)
-
+logger.info(
 f'User program crashed '
 f'({managed_job_status.value}). '
 f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +607,7 @@ class JobsController:
 f'[{self._strategy_executor.restart_cnt_on_failure}'
 f'/{max_restarts}]')
 else:
-
+logger.info(
 f'Task {task_id} failed and will not be retried')
 await managed_job_state.set_failed_async(
 self._job_id,
@@ -632,7 +620,7 @@ class JobsController:
 elif job_status is not None:
 # Either the job is cancelled (should not happen) or in some
 # unknown new state that we do not handle.
-
+logger.error(f'Unknown job status: {job_status}')
 failure_reason = (
 f'Unknown job status {job_status}. To see the details, '
 f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +637,9 @@ class JobsController:
 # job status. Try to recover the job (will not restart the
 # cluster, if the cluster is healthy).
 assert job_status is None, job_status
-
-
-
-'(the cluster will not be restarted).')
+logger.info('Failed to fetch the job status while the '
+'cluster is healthy. Try to recover the job '
+'(the cluster will not be restarted).')
 # When the handle is None, the cluster should be cleaned up already.
 if handle is not None:
 resources = handle.launched_resources
@@ -671,15 +658,14 @@ class JobsController:
 # Some spot resource (e.g., Spot TPU VM) may need to be
 # cleaned up after preemption, as running launch again on
 # those clusters again may fail.
-
-
-'...')
+logger.info('Cleaning up the preempted or failed cluster'
+'...')
 await self._cleanup_cluster(cluster_name)

 # Try to recover the managed jobs, when the cluster is preempted or
 # failed or the job status is failed to be fetched.
-
-
+logger.info(f'Starting recovery for task {task_id}, '
+f'it is currently {job_status}')
 await managed_job_state.set_recovering_async(
 job_id=self._job_id,
 task_id=task_id,
@@ -701,7 +687,7 @@ class JobsController:

 async def run(self):
 """Run controller logic and handle exceptions."""
-
+logger.info(f'Starting JobsController run for job {self._job_id}')
 task_id = 0
 cancelled = False

@@ -709,39 +695,36 @@
 succeeded = True
 # We support chain DAGs only for now.
 for task_id, task in enumerate(self._dag.tasks):
-
+logger.info(
 f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
 f'{task.name}')
 task_start = time.time()
 succeeded = await self._run_one_task(task_id, task)
 task_time = time.time() - task_start
-
-
-f'with success={succeeded}')
+logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+f'with success={succeeded}')

 if not succeeded:
-
-f'Task {task_id} failed, stopping execution')
+logger.info(f'Task {task_id} failed, stopping execution')
 break

 except exceptions.ProvisionPrechecksError as e:
 # Please refer to the docstring of self._run for the cases when
 # this exception can occur.
-
+logger.error(f'Provision prechecks failed for task {task_id}')
 failure_reason = ('; '.join(
 common_utils.format_exception(reason, use_bracket=True)
 for reason in e.reasons))
-
+logger.error(failure_reason)
 await self._update_failed_task_state(
 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
 failure_reason)
 except exceptions.ManagedJobReachedMaxRetriesError as e:
 # Please refer to the docstring of self._run for the cases when
 # this exception can occur.
-
-f'Managed job reached max retries for task {task_id}')
+logger.error(f'Managed job reached max retries for task {task_id}')
 failure_reason = common_utils.format_exception(e)
-
+logger.error(failure_reason)
 # The managed job should be marked as FAILED_NO_RESOURCE, as the
 # managed job may be able to launch next time.
 await self._update_failed_task_state(
@@ -753,13 +736,13 @@ class JobsController:
 cancelled = True
 raise
 except (Exception, SystemExit) as e: # pylint: disable=broad-except
-
+logger.error(
 f'Unexpected error in JobsController run for task {task_id}')
 with ux_utils.enable_traceback():
-
+logger.error(traceback.format_exc())
 msg = ('Unexpected error occurred: ' +
 common_utils.format_exception(e, use_bracket=True))
-
+logger.error(msg)
 await self._update_failed_task_state(
 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
 msg)
@@ -783,8 +766,8 @@ class JobsController:
 failure_type: managed_job_state.ManagedJobStatus,
 failure_reason: str):
 """Update the state of the failed task."""
-
-
+logger.info(f'Updating failed task state: task_id={task_id}, '
+f'failure_type={failure_type}')
 await managed_job_state.set_failed_async(
 self._job_id,
 task_id=task_id,
@@ -799,7 +782,8 @@
 class Controller:
 """Controller for managing jobs."""

-def __init__(self) -> None:
+def __init__(self, controller_uuid: str) -> None:
+self._controller_uuid = controller_uuid
 # Global state for active jobs
 self.job_tasks: Dict[int, asyncio.Task] = {}
 self.starting: Set[int] = set()
@@ -813,10 +797,11 @@ class Controller:
 # launch).
 self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

+self._pid = os.getpid()
+
 async def _cleanup(self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 pool: Optional[str] = None):
 """Clean up the cluster(s) and storages.

@@ -842,14 +827,13 @@ class Controller:
 cluster_name = (
 managed_job_utils.generate_managed_job_cluster_name(
 task.name, job_id))
-managed_job_utils.terminate_cluster(cluster_name
-_logger=job_logger)
+managed_job_utils.terminate_cluster(cluster_name)
 status = core.status(cluster_names=[cluster_name],
 all_users=True)
 assert (len(status) == 0 or
 status[0]['status'] == sky.ClusterStatus.STOPPED), (
 f'{cluster_name} is not down: {status}')
-
+logger.info(f'{cluster_name} is down')
 else:
 cluster_name, job_id_on_pool_cluster = (
 managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +844,7 @@ class Controller:
 _try_cancel_if_cluster_is_init=True)
 except Exception as e: # pylint: disable=broad-except
 error = e
-
+logger.warning(
 f'Failed to terminate cluster {cluster_name}: {e}')
 # we continue to try cleaning up whatever else we can.
 # Clean up Storages with persistent=False.
@@ -874,7 +858,7 @@ class Controller:
 for storage in task.storage_mounts.values():
 storage.construct()
 except (exceptions.StorageSpecError, exceptions.StorageError) as e:
-
+logger.warning(
 f'Failed to construct storage object for teardown: {e}\n'
 'This may happen because storage construction already '
 'failed during launch, storage was deleted externally, '
@@ -884,7 +868,7 @@ class Controller:
 backend.teardown_ephemeral_storage(task)
 except Exception as e: # pylint: disable=broad-except
 error = e
-
+logger.warning(f'Failed to teardown ephemeral storage: {e}')
 # we continue to try cleaning up whatever else we can.

 # Clean up any files mounted from the local disk, such as two-hop
@@ -902,7 +886,7 @@ class Controller:
 else:
 os.remove(path)
 except Exception as e: # pylint: disable=broad-except
-
+logger.warning(
 f'Failed to clean up file mount {file_mount}: {e}')

 if error is not None:
@@ -924,11 +908,10 @@ class Controller:

 # Use context.contextual to enable per-job output redirection and env var
 # isolation.
-@context.
+@context.contextual_async
 async def run_job_loop(self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 log_file: str,
 env_file_path: Optional[str] = None,
 pool: Optional[str] = None):
@@ -937,45 +920,52 @@ class Controller:
 assert ctx is not None, 'Context is not initialized'
 ctx.redirect_log(pathlib.Path(log_file))

+logger.info(f'Starting job loop for {job_id}')
+logger.info(f' dag_yaml={dag_yaml}')
+logger.info(f' log_file={log_file}')
+logger.info(f' env_file_path={env_file_path}')
+logger.info(f' pool={pool}')
+logger.info(f'From controller {self._controller_uuid}')
+logger.info(f' pid={self._pid}')
+
 # Load and apply environment variables from the job's environment file
 if env_file_path and os.path.exists(env_file_path):
 try:
 # Load environment variables from the file
 env_vars = dotenv.dotenv_values(env_file_path)
-
-
+logger.info(f'Loading environment from {env_file_path}: '
+f'{list(env_vars.keys())}')

 # Apply environment variables to the job's context
 if ctx is not None:
 for key, value in env_vars.items():
 if value is not None:
 ctx.override_envs({key: value})
-
+logger.debug(
 f'Set environment variable: {key}={value}')
 # Reload the skypilot config for this context to make sure
 # the latest config is used.
 skypilot_config.reload_config()
 else:
-
+logger.error(
 'Context is None, cannot set environment variables')
 except Exception as e: # pylint: disable=broad-except
-
+logger.error(
 f'Failed to load environment file {env_file_path}: {e}')
 elif env_file_path:
-
+logger.error(f'Environment file not found: {env_file_path}')

 cancelling = False
 try:
-
+logger.info(f'Starting job loop for {job_id}')

-controller = JobsController(job_id, dag_yaml,
-self.
+controller = JobsController(job_id, dag_yaml, self.starting,
+self._job_tasks_lock,
 self._starting_signal, pool)

 async with self._job_tasks_lock:
 if job_id in self.job_tasks:
-
-f'Job {job_id} already exists in job_tasks')
+logger.error(f'Job {job_id} already exists in job_tasks')
 raise ValueError(f'Job {job_id} already exists')

 # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
 self.job_tasks[job_id] = task
 await task
 except asyncio.CancelledError:
-
+logger.info(f'Job {job_id} was cancelled')
 dag, _ = _get_dag_and_name(dag_yaml)
 task_id, _ = await (
 managed_job_state.get_latest_task_id_status_async(job_id))
 assert task_id is not None, job_id
-
-
+logger.info(f'Cancelling managed job, job_id: {job_id}, '
+f'task_id: {task_id}')
 await managed_job_state.set_cancelling_async(
 job_id=job_id,
 callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
 cancelling = True
 raise
 except Exception as e:
-
-
+logger.error(f'Unexpected error in job loop for {job_id}: '
+f'{common_utils.format_exception(e)}')
 raise
 finally:
 try:
-await self._cleanup(job_id,
-
-job_logger=job_logger,
-pool=pool)
-job_logger.info(
+await self._cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
+logger.info(
 f'Cluster of managed job {job_id} has been cleaned up.')
 except Exception as e: # pylint: disable=broad-except
 failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
 # The job can be non-terminal if the controller exited abnormally,
 # e.g. failed to launch cluster after reaching the MAX_RETRY.
 if not job_status.is_terminal():
-
+logger.info(f'Previous job status: {job_status.value}')
 await managed_job_state.set_failed_async(
 job_id,
 task_id=None,
@@ -1080,37 +1067,20 @@ class Controller:
 dag_yaml: Path to the YAML file containing the DAG definition.
 env_file_path: Optional path to environment file for the job.
 """
-# Create
+# Create log file path for job output redirection
 log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
 os.makedirs(log_dir, exist_ok=True)
 log_file = os.path.join(log_dir, f'{job_id}.log')

-
-
-
-# Create file handler
-file_handler = logging.FileHandler(log_file)
-file_handler.setLevel(logging.DEBUG)
-
-# Use Sky's standard formatter
-file_handler.setFormatter(sky_logging.FORMATTER)
-
-# Add the handler to the logger
-job_logger.addHandler(file_handler)
-
-# Prevent log propagation to avoid duplicate logs
-job_logger.propagate = False
-
-job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
-f'env_file_path={env_file_path}')
+logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
+f'env_file_path={env_file_path}, and log_file={log_file}')

 async with self._job_tasks_lock:
 self.starting.add(job_id)
 await create_background_task(
-self.run_job_loop(job_id, dag_yaml,
-env_file_path, pool))
+self.run_job_loop(job_id, dag_yaml, log_file, env_file_path, pool))

-
+logger.info(f'Job {job_id} started successfully')

 async def cancel_job(self):
 """Cancel an existing job."""
@@ -1161,6 +1131,7 @@ class Controller:
 scheduler.get_number_of_controllers()))

 if len(running_tasks) >= max_jobs:
+logger.info('Too many jobs running, waiting for 60 seconds')
 await asyncio.sleep(60)
 continue

@@ -1174,9 +1145,11 @@ class Controller:
 continue

 if waiting_job is None:
+logger.info('No waiting job, waiting for 10 seconds')
 await asyncio.sleep(10)
 continue

+logger.info(f'Claiming job {waiting_job["job_id"]}')
 job_id = waiting_job['job_id']
 dag_yaml_path = waiting_job['dag_yaml_path']
 env_file_path = waiting_job.get('env_file_path')
@@ -1202,10 +1175,12 @@ class Controller:
 await self.start_job(job_id, dag_yaml_path, env_file_path, pool)


-async def main():
+async def main(controller_uuid: str):
+logger.info(f'Starting controller {controller_uuid}')
+
 context_utils.hijack_sys_attrs()

-controller = Controller()
+controller = Controller(controller_uuid)

 # Will happen multiple times, who cares though
 os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
@@ -1214,6 +1189,8 @@ async def main():
 soft = None
 try:
 soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+logger.info(f'Increasing soft limit to {hard}')
 resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
 except OSError as e:
 logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1231,4 +1208,4 @@ async def main():


 if __name__ == '__main__':
-asyncio.run(main())
+asyncio.run(main(sys.argv[1]))