skypilot-nightly 1.0.0.dev20251012-py3-none-any.whl → 1.0.0.dev20251014-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (63)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/shadeform.py +89 -0
  3. sky/authentication.py +52 -2
  4. sky/backends/backend_utils.py +35 -25
  5. sky/backends/cloud_vm_ray_backend.py +5 -5
  6. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  7. sky/catalog/kubernetes_catalog.py +19 -25
  8. sky/catalog/shadeform_catalog.py +165 -0
  9. sky/client/cli/command.py +53 -19
  10. sky/client/sdk.py +13 -1
  11. sky/clouds/__init__.py +2 -0
  12. sky/clouds/shadeform.py +393 -0
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/jobs/controller.py +122 -145
  30. sky/jobs/recovery_strategy.py +59 -82
  31. sky/jobs/scheduler.py +5 -5
  32. sky/jobs/state.py +65 -21
  33. sky/jobs/utils.py +58 -22
  34. sky/metrics/utils.py +27 -6
  35. sky/provision/__init__.py +1 -0
  36. sky/provision/kubernetes/utils.py +44 -39
  37. sky/provision/shadeform/__init__.py +11 -0
  38. sky/provision/shadeform/config.py +12 -0
  39. sky/provision/shadeform/instance.py +351 -0
  40. sky/provision/shadeform/shadeform_utils.py +83 -0
  41. sky/server/common.py +4 -2
  42. sky/server/requests/executor.py +25 -3
  43. sky/server/server.py +9 -3
  44. sky/setup_files/dependencies.py +1 -0
  45. sky/sky_logging.py +0 -2
  46. sky/skylet/constants.py +23 -6
  47. sky/skylet/log_lib.py +0 -1
  48. sky/skylet/log_lib.pyi +1 -1
  49. sky/templates/shadeform-ray.yml.j2 +72 -0
  50. sky/utils/common.py +2 -0
  51. sky/utils/context.py +57 -51
  52. sky/utils/context_utils.py +15 -11
  53. sky/utils/controller_utils.py +35 -8
  54. sky/utils/locks.py +20 -5
  55. sky/utils/subprocess_utils.py +4 -3
  56. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
  57. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
  58. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
  59. /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
  60. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
  61. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
  62. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
  63. {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -1,7 +1,6 @@
  """Controller: handles scheduling and the life cycle of a managed job.
  """
  import asyncio
- import logging
  import os
  import pathlib
  import resource
@@ -95,7 +94,6 @@ class JobsController:
  - ``_dag_yaml`` / ``_dag`` / ``_dag_name``: The job definition and metadata.
  - ``_backend``: Backend used to launch and manage clusters.
  - ``_pool``: Optional pool name if using a cluster pool.
- - ``_logger``: Job-scoped logger for progress and diagnostics.
  - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
  coordination primitives. ``starting_lock`` must be used for accessing
  ``starting_signal`` and ``starting``
@@ -107,7 +105,6 @@ class JobsController:
  self,
  job_id: int,
  dag_yaml: str,
- job_logger: logging.Logger,
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
@@ -118,7 +115,6 @@ class JobsController:
  Args:
  job_id: Integer ID of the managed job.
  dag_yaml: Path to the YAML file containing the chain DAG to run.
- job_logger: Logger instance dedicated to this job.
  starting: Shared set of job IDs currently in the STARTING phase,
  used to limit concurrent launches.
  starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +130,13 @@ class JobsController:
  self.starting_lock = starting_lock
  self.starting_signal = starting_signal

- self._logger = job_logger
- self._logger.info(f'Initializing JobsController for job_id={job_id}, '
- f'dag_yaml={dag_yaml}')
+ logger.info(f'Initializing JobsController for job_id={job_id}, '
+ f'dag_yaml={dag_yaml}')

  self._job_id = job_id
  self._dag_yaml = dag_yaml
  self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
- self._logger.info(f'Loaded DAG: {self._dag}')
+ logger.info(f'Loaded DAG: {self._dag}')

  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
  self._pool = pool
@@ -191,8 +186,8 @@ class JobsController:
  preemptions or ssh disconnection during the streaming.
  """
  if handle is None:
- self._logger.info(f'Cluster for job {self._job_id} is not found. '
- 'Skipping downloading and streaming the logs.')
+ logger.info(f'Cluster for job {self._job_id} is not found. '
+ 'Skipping downloading and streaming the logs.')
  return

  managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +205,11 @@ class JobsController:
  managed_job_state.set_local_log_file(self._job_id, task_id,
  log_file)
  else:
- self._logger.warning(
+ logger.warning(
  f'No log file was downloaded for job {self._job_id}, '
  f'task {task_id}')

- self._logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
+ logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

  async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
  if cluster_name is None:
@@ -259,7 +254,7 @@ class JobsController:
  Other exceptions may be raised depending on the backend.
  """
  task_start_time = time.time()
- self._logger.info(
+ logger.info(
  f'Starting task {task_id} ({task.name}) for job {self._job_id}')

  latest_task_id, last_task_prev_status = (
@@ -271,22 +266,20 @@ class JobsController:
  managed_job_state.ManagedJobStatus.PENDING):
  assert latest_task_id >= task_id, (latest_task_id, task_id)
  if latest_task_id > task_id:
- self._logger.info(f'Task {task_id} ({task.name}) has already '
- 'been executed. Skipping...')
+ logger.info(f'Task {task_id} ({task.name}) has already '
+ 'been executed. Skipping...')
  return True
  if latest_task_id == task_id:
  # Start recovery.
  is_resume = True
- self._logger.info(
- f'Resuming task {task_id} from previous execution')
+ logger.info(f'Resuming task {task_id} from previous execution')

  callback_func = managed_job_utils.event_callback_func(
  job_id=self._job_id, task_id=task_id, task=task)

  if task.run is None:
- self._logger.info(
- f'Skip running task {task_id} ({task.name}) due to its '
- 'run commands being empty.')
+ logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+ 'run commands being empty.')
  # Call set_started first to initialize columns in the state table,
  # including start_at and last_recovery_at to avoid issues for
  # uninitialized columns.
@@ -300,8 +293,7 @@ class JobsController:
  task_id=task_id,
  end_time=time.time(),
  callback_func=callback_func)
- self._logger.info(
- f'Empty task {task_id} marked as succeeded immediately')
+ logger.info(f'Empty task {task_id} marked as succeeded immediately')
  return True

  usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +306,7 @@ class JobsController:
  task.name, self._job_id) if self._pool is None else None
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
  cluster_name, self._backend, task, self._job_id, task_id,
- self._logger, self._pool, self.starting, self.starting_lock,
- self.starting_signal)
+ self._pool, self.starting, self.starting_lock, self.starting_signal)
  if not is_resume:
  submitted_at = time.time()
  if task_id == 0:
@@ -336,11 +327,11 @@ class JobsController:
  self._strategy_executor.max_restarts_on_errors
  },
  callback_func=callback_func)
- self._logger.info(f'Submitted managed job {self._job_id} '
- f'(task: {task_id}, name: {task.name!r}); '
- f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
+ logger.info(f'Submitted managed job {self._job_id} '
+ f'(task: {task_id}, name: {task.name!r}); '
+ f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

- self._logger.info('Started monitoring.')
+ logger.info('Started monitoring.')

  # Only do the initial cluster launch if not resuming from a controller
  # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +345,7 @@ class JobsController:
  remote_job_submitted_at = await self._strategy_executor.launch()

  launch_time = time.time() - launch_start
- self._logger.info(f'Cluster launch completed in {launch_time:.2f}s')
+ logger.info(f'Cluster launch completed in {launch_time:.2f}s')
  assert remote_job_submitted_at is not None, remote_job_submitted_at
  if self._pool is None:
  job_id_on_pool_cluster = None
@@ -367,16 +358,16 @@ class JobsController:
  # Check if we have been cancelled here, in the case where a user
  # quickly cancels the job we want to gracefully handle it here,
  # otherwise we will end up in the FAILED_CONTROLLER state.
- self._logger.info(f'Cluster name is None for job {self._job_id}, '
- f'task {task_id}. Checking if we have been '
- 'cancelled.')
+ logger.info(f'Cluster name is None for job {self._job_id}, '
+ f'task {task_id}. Checking if we have been '
+ 'cancelled.')
  status = await (managed_job_state.get_job_status_with_task_id_async(
  job_id=self._job_id, task_id=task_id))
- self._logger.debug(f'Status for job {self._job_id}, task {task_id}:'
- f'{status}')
+ logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+ f'{status}')
  if status == managed_job_state.ManagedJobStatus.CANCELLED:
- self._logger.info(f'Job {self._job_id}, task {task_id} has '
- 'been quickly cancelled.')
+ logger.info(f'Job {self._job_id}, task {task_id} has '
+ 'been quickly cancelled.')
  raise asyncio.CancelledError()
  assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +408,7 @@ class JobsController:

  if prev_status is not None:
  if prev_status.is_terminal():
- self._logger.info(
+ logger.info(
  f'Task {task_id} already in terminal state: '
  f'{prev_status}')
  return (prev_status ==
@@ -427,9 +418,8 @@ class JobsController:
  # If the controller is down when cancelling the job,
  # we re-raise the error to run the `_cleanup` function
  # again to clean up any remaining resources.
- self._logger.info(
- f'Task {task_id} was being cancelled, '
- 're-raising cancellation')
+ logger.info(f'Task {task_id} was being cancelled, '
+ 're-raising cancellation')
  raise asyncio.CancelledError()
  if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
  force_transit_to_recovering = True
@@ -443,10 +433,9 @@ class JobsController:
  try:
  await backend_utils.async_check_network_connection()
  except exceptions.NetworkError:
- self._logger.info(
- 'Network is not available. Retrying again in '
- f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
- 'seconds.')
+ logger.info('Network is not available. Retrying again in '
+ f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+ 'seconds.')
  continue

  # NOTE: we do not check cluster status first because race condition
@@ -461,23 +450,22 @@ class JobsController:
  self._backend,
  cluster_name,
  job_id=job_id_on_pool_cluster,
- job_logger=self._logger,
  )
  except exceptions.FetchClusterInfoError as fetch_e:
- self._logger.info(
+ logger.info(
  'Failed to fetch the job status. Start recovery.\n'
  f'Exception: {common_utils.format_exception(fetch_e)}\n'
  f'Traceback: {traceback.format_exc()}')

  if job_status == job_lib.JobStatus.SUCCEEDED:
- self._logger.info(f'Task {task_id} succeeded! '
- 'Getting end time and cleaning up')
+ logger.info(f'Task {task_id} succeeded! '
+ 'Getting end time and cleaning up')
  try:
  success_end_time = await context_utils.to_thread(
  managed_job_utils.try_to_get_job_end_time,
  self._backend, cluster_name, job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
- self._logger.warning(
+ logger.warning(
  f'Failed to get job end time: '
  f'{common_utils.format_exception(e)}',
  exc_info=True)
@@ -490,7 +478,7 @@ class JobsController:
  task_id,
  end_time=success_end_time,
  callback_func=callback_func)
- self._logger.info(
+ logger.info(
  f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
  f'Cleaning up the cluster {cluster_name}.')
  try:
@@ -511,7 +499,7 @@ class JobsController:
  job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
  # We don't want to crash here, so just log and continue.
- self._logger.warning(
+ logger.warning(
  f'Failed to download and stream logs: '
  f'{common_utils.format_exception(e)}',
  exc_info=True)
@@ -521,10 +509,10 @@ class JobsController:

  task_total_time = time.time() - task_start_time
  monitoring_time = time.time() - monitoring_start_time
- self._logger.info(f'Task {task_id} completed successfully in '
- f'{task_total_time:.2f}s '
- f'(monitoring time: {monitoring_time:.2f}s, '
- f'status checks: {status_check_count})')
+ logger.info(f'Task {task_id} completed successfully in '
+ f'{task_total_time:.2f}s '
+ f'(monitoring time: {monitoring_time:.2f}s, '
+ f'status checks: {status_check_count})')
  return True

  # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +548,7 @@ class JobsController:
  # code).
  cluster_status_str = ('' if cluster_status is None else
  f' (status: {cluster_status.value})')
- self._logger.info(
+ logger.info(
  f'Cluster is preempted or failed{cluster_status_str}. '
  'Recovering...')
  else:
@@ -571,12 +559,12 @@ class JobsController:
  in job_lib.JobStatus.user_code_failure_states() or
  job_status == job_lib.JobStatus.FAILED_DRIVER):
  # The user code has probably crashed, fail immediately.
- self._logger.info(
+ logger.info(
  f'Task {task_id} failed with status: {job_status}')
  end_time = await context_utils.to_thread(
  managed_job_utils.try_to_get_job_end_time,
  self._backend, cluster_name, job_id_on_pool_cluster)
- self._logger.info(
+ logger.info(
  f'The user job failed ({job_status}). Please check the '
  'logs below.\n'
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +599,7 @@ class JobsController:
  if should_restart_on_failure:
  max_restarts = (
  self._strategy_executor.max_restarts_on_errors)
- self._logger.info(
+ logger.info(
  f'User program crashed '
  f'({managed_job_status.value}). '
  f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +607,7 @@ class JobsController:
  f'[{self._strategy_executor.restart_cnt_on_failure}'
  f'/{max_restarts}]')
  else:
- self._logger.info(
+ logger.info(
  f'Task {task_id} failed and will not be retried')
  await managed_job_state.set_failed_async(
  self._job_id,
@@ -632,7 +620,7 @@ class JobsController:
  elif job_status is not None:
  # Either the job is cancelled (should not happen) or in some
  # unknown new state that we do not handle.
- self._logger.error(f'Unknown job status: {job_status}')
+ logger.error(f'Unknown job status: {job_status}')
  failure_reason = (
  f'Unknown job status {job_status}. To see the details, '
  f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +637,9 @@ class JobsController:
  # job status. Try to recover the job (will not restart the
  # cluster, if the cluster is healthy).
  assert job_status is None, job_status
- self._logger.info(
- 'Failed to fetch the job status while the '
- 'cluster is healthy. Try to recover the job '
- '(the cluster will not be restarted).')
+ logger.info('Failed to fetch the job status while the '
+ 'cluster is healthy. Try to recover the job '
+ '(the cluster will not be restarted).')
  # When the handle is None, the cluster should be cleaned up already.
  if handle is not None:
  resources = handle.launched_resources
@@ -671,15 +658,14 @@ class JobsController:
  # Some spot resource (e.g., Spot TPU VM) may need to be
  # cleaned up after preemption, as running launch again on
  # those clusters again may fail.
- self._logger.info(
- 'Cleaning up the preempted or failed cluster'
- '...')
+ logger.info('Cleaning up the preempted or failed cluster'
+ '...')
  await self._cleanup_cluster(cluster_name)

  # Try to recover the managed jobs, when the cluster is preempted or
  # failed or the job status is failed to be fetched.
- self._logger.info(f'Starting recovery for task {task_id}, '
- f'it is currently {job_status}')
+ logger.info(f'Starting recovery for task {task_id}, '
+ f'it is currently {job_status}')
  await managed_job_state.set_recovering_async(
  job_id=self._job_id,
  task_id=task_id,
@@ -701,7 +687,7 @@ class JobsController:

  async def run(self):
  """Run controller logic and handle exceptions."""
- self._logger.info(f'Starting JobsController run for job {self._job_id}')
+ logger.info(f'Starting JobsController run for job {self._job_id}')
  task_id = 0
  cancelled = False

@@ -709,39 +695,36 @@ class JobsController:
  succeeded = True
  # We support chain DAGs only for now.
  for task_id, task in enumerate(self._dag.tasks):
- self._logger.info(
+ logger.info(
  f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
  f'{task.name}')
  task_start = time.time()
  succeeded = await self._run_one_task(task_id, task)
  task_time = time.time() - task_start
- self._logger.info(
- f'Task {task_id} completed in {task_time:.2f}s '
- f'with success={succeeded}')
+ logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+ f'with success={succeeded}')

  if not succeeded:
- self._logger.info(
- f'Task {task_id} failed, stopping execution')
+ logger.info(f'Task {task_id} failed, stopping execution')
  break

  except exceptions.ProvisionPrechecksError as e:
  # Please refer to the docstring of self._run for the cases when
  # this exception can occur.
- self._logger.error(f'Provision prechecks failed for task {task_id}')
+ logger.error(f'Provision prechecks failed for task {task_id}')
  failure_reason = ('; '.join(
  common_utils.format_exception(reason, use_bracket=True)
  for reason in e.reasons))
- self._logger.error(failure_reason)
+ logger.error(failure_reason)
  await self._update_failed_task_state(
  task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
  failure_reason)
  except exceptions.ManagedJobReachedMaxRetriesError as e:
  # Please refer to the docstring of self._run for the cases when
  # this exception can occur.
- self._logger.error(
- f'Managed job reached max retries for task {task_id}')
+ logger.error(f'Managed job reached max retries for task {task_id}')
  failure_reason = common_utils.format_exception(e)
- self._logger.error(failure_reason)
+ logger.error(failure_reason)
  # The managed job should be marked as FAILED_NO_RESOURCE, as the
  # managed job may be able to launch next time.
  await self._update_failed_task_state(
@@ -753,13 +736,13 @@ class JobsController:
  cancelled = True
  raise
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
- self._logger.error(
+ logger.error(
  f'Unexpected error in JobsController run for task {task_id}')
  with ux_utils.enable_traceback():
- self._logger.error(traceback.format_exc())
+ logger.error(traceback.format_exc())
  msg = ('Unexpected error occurred: ' +
  common_utils.format_exception(e, use_bracket=True))
- self._logger.error(msg)
+ logger.error(msg)
  await self._update_failed_task_state(
  task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
  msg)
@@ -783,8 +766,8 @@ class JobsController:
  failure_type: managed_job_state.ManagedJobStatus,
  failure_reason: str):
  """Update the state of the failed task."""
- self._logger.info(f'Updating failed task state: task_id={task_id}, '
- f'failure_type={failure_type}')
+ logger.info(f'Updating failed task state: task_id={task_id}, '
+ f'failure_type={failure_type}')
  await managed_job_state.set_failed_async(
  self._job_id,
  task_id=task_id,
@@ -799,7 +782,8 @@ class JobsController:
  class Controller:
  """Controller for managing jobs."""

- def __init__(self) -> None:
+ def __init__(self, controller_uuid: str) -> None:
+ self._controller_uuid = controller_uuid
  # Global state for active jobs
  self.job_tasks: Dict[int, asyncio.Task] = {}
  self.starting: Set[int] = set()
@@ -813,10 +797,11 @@ class Controller:
  # launch).
  self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

+ self._pid = os.getpid()
+
  async def _cleanup(self,
  job_id: int,
  dag_yaml: str,
- job_logger: logging.Logger,
  pool: Optional[str] = None):
  """Clean up the cluster(s) and storages.

@@ -842,14 +827,13 @@ class Controller:
  cluster_name = (
  managed_job_utils.generate_managed_job_cluster_name(
  task.name, job_id))
- managed_job_utils.terminate_cluster(cluster_name,
- _logger=job_logger)
+ managed_job_utils.terminate_cluster(cluster_name)
  status = core.status(cluster_names=[cluster_name],
  all_users=True)
  assert (len(status) == 0 or
  status[0]['status'] == sky.ClusterStatus.STOPPED), (
  f'{cluster_name} is not down: {status}')
- job_logger.info(f'{cluster_name} is down')
+ logger.info(f'{cluster_name} is down')
  else:
  cluster_name, job_id_on_pool_cluster = (
  managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +844,7 @@ class Controller:
  _try_cancel_if_cluster_is_init=True)
  except Exception as e: # pylint: disable=broad-except
  error = e
- job_logger.warning(
+ logger.warning(
  f'Failed to terminate cluster {cluster_name}: {e}')
  # we continue to try cleaning up whatever else we can.
  # Clean up Storages with persistent=False.
@@ -874,7 +858,7 @@ class Controller:
  for storage in task.storage_mounts.values():
  storage.construct()
  except (exceptions.StorageSpecError, exceptions.StorageError) as e:
- job_logger.warning(
+ logger.warning(
  f'Failed to construct storage object for teardown: {e}\n'
  'This may happen because storage construction already '
  'failed during launch, storage was deleted externally, '
@@ -884,7 +868,7 @@ class Controller:
  backend.teardown_ephemeral_storage(task)
  except Exception as e: # pylint: disable=broad-except
  error = e
- job_logger.warning(f'Failed to teardown ephemeral storage: {e}')
+ logger.warning(f'Failed to teardown ephemeral storage: {e}')
  # we continue to try cleaning up whatever else we can.

  # Clean up any files mounted from the local disk, such as two-hop
@@ -902,7 +886,7 @@ class Controller:
  else:
  os.remove(path)
  except Exception as e: # pylint: disable=broad-except
- job_logger.warning(
+ logger.warning(
  f'Failed to clean up file mount {file_mount}: {e}')

  if error is not None:
@@ -924,11 +908,10 @@ class Controller:

  # Use context.contextual to enable per-job output redirection and env var
  # isolation.
- @context.contextual
+ @context.contextual_async
  async def run_job_loop(self,
  job_id: int,
  dag_yaml: str,
- job_logger: logging.Logger,
  log_file: str,
  env_file_path: Optional[str] = None,
  pool: Optional[str] = None):
@@ -937,45 +920,52 @@ class Controller:
  assert ctx is not None, 'Context is not initialized'
  ctx.redirect_log(pathlib.Path(log_file))

+ logger.info(f'Starting job loop for {job_id}')
+ logger.info(f' dag_yaml={dag_yaml}')
+ logger.info(f' log_file={log_file}')
+ logger.info(f' env_file_path={env_file_path}')
+ logger.info(f' pool={pool}')
+ logger.info(f'From controller {self._controller_uuid}')
+ logger.info(f' pid={self._pid}')
+
  # Load and apply environment variables from the job's environment file
  if env_file_path and os.path.exists(env_file_path):
  try:
  # Load environment variables from the file
  env_vars = dotenv.dotenv_values(env_file_path)
- job_logger.info(f'Loading environment from {env_file_path}: '
- f'{list(env_vars.keys())}')
+ logger.info(f'Loading environment from {env_file_path}: '
+ f'{list(env_vars.keys())}')

  # Apply environment variables to the job's context
  if ctx is not None:
  for key, value in env_vars.items():
  if value is not None:
  ctx.override_envs({key: value})
- job_logger.debug(
+ logger.debug(
  f'Set environment variable: {key}={value}')
  # Reload the skypilot config for this context to make sure
  # the latest config is used.
  skypilot_config.reload_config()
  else:
- job_logger.error(
+ logger.error(
  'Context is None, cannot set environment variables')
  except Exception as e: # pylint: disable=broad-except
- job_logger.error(
+ logger.error(
  f'Failed to load environment file {env_file_path}: {e}')
  elif env_file_path:
- job_logger.error(f'Environment file not found: {env_file_path}')
+ logger.error(f'Environment file not found: {env_file_path}')

  cancelling = False
  try:
- job_logger.info(f'Starting job loop for {job_id}')
+ logger.info(f'Starting job loop for {job_id}')

- controller = JobsController(job_id, dag_yaml, job_logger,
- self.starting, self._job_tasks_lock,
+ controller = JobsController(job_id, dag_yaml, self.starting,
+ self._job_tasks_lock,
  self._starting_signal, pool)

  async with self._job_tasks_lock:
  if job_id in self.job_tasks:
- job_logger.error(
- f'Job {job_id} already exists in job_tasks')
+ logger.error(f'Job {job_id} already exists in job_tasks')
  raise ValueError(f'Job {job_id} already exists')

  # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
  self.job_tasks[job_id] = task
  await task
  except asyncio.CancelledError:
- job_logger.info(f'Job {job_id} was cancelled')
+ logger.info(f'Job {job_id} was cancelled')
  dag, _ = _get_dag_and_name(dag_yaml)
  task_id, _ = await (
  managed_job_state.get_latest_task_id_status_async(job_id))
  assert task_id is not None, job_id
- job_logger.info(f'Cancelling managed job, job_id: {job_id}, '
- f'task_id: {task_id}')
+ logger.info(f'Cancelling managed job, job_id: {job_id}, '
+ f'task_id: {task_id}')
  await managed_job_state.set_cancelling_async(
  job_id=job_id,
  callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
  cancelling = True
  raise
  except Exception as e:
- job_logger.error(f'Unexpected error in job loop for {job_id}: '
- f'{common_utils.format_exception(e)}')
+ logger.error(f'Unexpected error in job loop for {job_id}: '
+ f'{common_utils.format_exception(e)}')
  raise
  finally:
  try:
- await self._cleanup(job_id,
- dag_yaml=dag_yaml,
- job_logger=job_logger,
- pool=pool)
- job_logger.info(
+ await self._cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
+ logger.info(
  f'Cluster of managed job {job_id} has been cleaned up.')
  except Exception as e: # pylint: disable=broad-except
  failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
  # The job can be non-terminal if the controller exited abnormally,
  # e.g. failed to launch cluster after reaching the MAX_RETRY.
  if not job_status.is_terminal():
- job_logger.info(f'Previous job status: {job_status.value}')
+ logger.info(f'Previous job status: {job_status.value}')
  await managed_job_state.set_failed_async(
  job_id,
  task_id=None,
@@ -1080,37 +1067,20 @@ class Controller:
  dag_yaml: Path to the YAML file containing the DAG definition.
  env_file_path: Optional path to environment file for the job.
  """
- # Create a job-specific logger
+ # Create log file path for job output redirection
  log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
  os.makedirs(log_dir, exist_ok=True)
  log_file = os.path.join(log_dir, f'{job_id}.log')

- job_logger = logging.getLogger(f'sky.jobs.{job_id}')
- job_logger.setLevel(logging.DEBUG)
-
- # Create file handler
- file_handler = logging.FileHandler(log_file)
- file_handler.setLevel(logging.DEBUG)
-
- # Use Sky's standard formatter
- file_handler.setFormatter(sky_logging.FORMATTER)
-
- # Add the handler to the logger
- job_logger.addHandler(file_handler)
-
- # Prevent log propagation to avoid duplicate logs
- job_logger.propagate = False
-
- job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
- f'env_file_path={env_file_path}')
+ logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
+ f'env_file_path={env_file_path}, and log_file={log_file}')

  async with self._job_tasks_lock:
  self.starting.add(job_id)
  await create_background_task(
- self.run_job_loop(job_id, dag_yaml, job_logger, log_file,
- env_file_path, pool))
+ self.run_job_loop(job_id, dag_yaml, log_file, env_file_path, pool))

- job_logger.info(f'Job {job_id} started successfully')
+ logger.info(f'Job {job_id} started successfully')

  async def cancel_job(self):
  """Cancel an existing job."""
@@ -1161,6 +1131,7 @@ class Controller:
  scheduler.get_number_of_controllers()))

  if len(running_tasks) >= max_jobs:
+ logger.info('Too many jobs running, waiting for 60 seconds')
  await asyncio.sleep(60)
  continue

@@ -1174,9 +1145,11 @@ class Controller:
  continue

  if waiting_job is None:
+ logger.info('No waiting job, waiting for 10 seconds')
  await asyncio.sleep(10)
  continue

+ logger.info(f'Claiming job {waiting_job["job_id"]}')
  job_id = waiting_job['job_id']
  dag_yaml_path = waiting_job['dag_yaml_path']
  env_file_path = waiting_job.get('env_file_path')
@@ -1202,10 +1175,12 @@ class Controller:
  await self.start_job(job_id, dag_yaml_path, env_file_path, pool)


- async def main():
+ async def main(controller_uuid: str):
+ logger.info(f'Starting controller {controller_uuid}')
+
  context_utils.hijack_sys_attrs()

- controller = Controller()
+ controller = Controller(controller_uuid)

  # Will happen multiple times, who cares though
  os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
@@ -1214,6 +1189,8 @@ async def main():
  soft = None
  try:
  soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+ logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+ logger.info(f'Increasing soft limit to {hard}')
  resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
  except OSError as e:
  logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1231,4 +1208,4 @@ async def main():


  if __name__ == '__main__':
- asyncio.run(main())
+ asyncio.run(main(sys.argv[1]))
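
The largest structural change in sky/jobs/controller.py above is the removal of the per-job logging.Logger (a logging.getLogger(f'sky.jobs.{job_id}') with its own FileHandler) in favor of the shared module-level logger, with per-job output redirection handled by @context.contextual_async and ctx.redirect_log(pathlib.Path(log_file)). The following is a minimal sketch of how that kind of pattern can work in general, assuming a contextvar-based job context; JobContext, contextual_async, and redirect_log here are stand-in names, not SkyPilot's actual implementation:

# A minimal sketch, not SkyPilot's implementation: route one module-level
# logger to a per-job log file via a contextvar. JobContext, contextual_async
# and redirect_log are hypothetical stand-ins for the context.* helpers the
# diff above relies on.
import asyncio
import contextvars
import functools
import logging
import pathlib
from typing import Optional

_JOB_CONTEXT: contextvars.ContextVar = contextvars.ContextVar(
    'job_context', default=None)


class JobContext:
    """Per-job state; holds the log file this job's output is redirected to."""

    def __init__(self) -> None:
        self.log_path: Optional[pathlib.Path] = None

    def redirect_log(self, path: pathlib.Path) -> None:
        self.log_path = path


class _ContextFileHandler(logging.Handler):
    """Appends each record to the log file of the current job context."""

    def emit(self, record: logging.LogRecord) -> None:
        ctx = _JOB_CONTEXT.get()
        if ctx is None or ctx.log_path is None:
            return  # No job context bound: drop the record in this sketch.
        # Opening the file per record is slow but keeps the sketch short.
        with ctx.log_path.open('a', encoding='utf-8') as f:
            f.write(self.format(record) + '\n')


logger = logging.getLogger('sketch.jobs.controller')
logger.setLevel(logging.DEBUG)
logger.addHandler(_ContextFileHandler())


def contextual_async(func):
    """Run the coroutine with a fresh JobContext bound to the calling task."""

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        token = _JOB_CONTEXT.set(JobContext())
        try:
            return await func(*args, **kwargs)
        finally:
            _JOB_CONTEXT.reset(token)

    return wrapper


@contextual_async
async def run_job_loop(job_id: int, log_file: str) -> None:
    ctx = _JOB_CONTEXT.get()
    assert ctx is not None, 'Context is not initialized'
    ctx.redirect_log(pathlib.Path(log_file))
    # Calls on the shared module-level logger from this task now land in this
    # job's log file; no per-job Logger or FileHandler is needed.
    logger.info(f'Starting job loop for {job_id}')


if __name__ == '__main__':
    asyncio.run(run_job_loop(1, '/tmp/job-1.log'))

In the diff above, each job loop is launched as its own background asyncio task, so a context-scoped redirect of this kind keeps every job's output in its own log file while all code shares the single module-level logger.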