skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/file_content_utils.py ADDED
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
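For orientation, a minimal consumer sketch of the two helpers above; the wrapper functions load_dag_yaml_text and materialize_env_file are illustrative assumptions, not part of the package:

# Hypothetical consumer sketch (not part of this wheel): resolve a managed
# job's DAG and optional env file purely from strings, without assuming the
# original YAML still exists on the controller's disk.
import tempfile

from sky.jobs import file_content_utils


def load_dag_yaml_text(job_id: int) -> str:
    """Return the DAG YAML text for a job, raising if it is unrecoverable."""
    content = file_content_utils.get_job_dag_content(job_id)
    if content is None:
        raise FileNotFoundError(f'No DAG YAML stored for job {job_id}')
    return content


def materialize_env_file(job_id: int) -> str:
    """Write the optional env file to a temp path; return '' if absent."""
    env_content = file_content_utils.get_job_env_content(job_id)
    if env_content is None:
        return ''
    with tempfile.NamedTemporaryFile('w', suffix='.env', delete=False) as f:
        f.write(env_content)
        return f.name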
sky/jobs/log_gc.py ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least once per hour to ensure hourly accuracy and
+    # at most once per 30 seconds (when retention_seconds is small) to
+    # avoid too frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Task logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished. False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync down logs will reference this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update; the timestamp will not be accurate but it's okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished. False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        #   - run.log
+        #   - tasks/
+        #     - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have an at-least-once semantic guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching and managing a new process; the threads that are
+    not elected as the log garbage collector simply wait on the filelock,
+    adding trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
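The GC interval above is simply the configured retention (jobs.controller.task_logs_gc_retention_hours / controller_logs_gc_retention_hours, converted to seconds) clamped into [30 s, 1 h]: the collector never sleeps longer than an hour, and never wakes more often than every 30 seconds. A standalone re-implementation of that clamping, for illustration only:

# Standalone re-implementation of the clamping logic from log_gc.py, for
# illustration only; it mirrors _next_gc_interval above.
LEAST_FREQUENT = 3600  # seconds: never sleep longer than an hour
MOST_FREQUENT = 30     # seconds: never wake more often than every 30 s


def next_gc_interval(retention_seconds: int) -> int:
    return max(min(retention_seconds, LEAST_FREQUENT), MOST_FREQUENT)


assert next_gc_interval(5) == 30                 # tiny retention -> 30 s floor
assert next_gc_interval(600) == 600              # mid-range passes through
assert next_gc_interval(7 * 24 * 3600) == 3600   # week-long retention -> hourly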
sky/jobs/recovery_strategy.py CHANGED
@@ -70,7 +70,6 @@ class StrategyExecutor:
         max_restarts_on_errors: int,
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
@@ -85,7 +84,6 @@ class StrategyExecutor:
             max_restarts_on_errors: Maximum number of restarts on errors.
             job_id: The ID of the job.
             task_id: The ID of the task.
-            job_logger: Logger instance for this specific job.
             starting: Set of job IDs that are currently starting.
             starting_lock: Lock to synchronize starting jobs.
             starting_signal: Condition to signal when a job can start.
@@ -105,7 +103,6 @@
         self.task_id = task_id
         self.pool = pool
         self.restart_cnt_on_failure = 0
-        self._logger = job_logger
         self.job_id_on_pool_cluster: Optional[int] = None
         self.starting = starting
         self.starting_lock = starting_lock
@@ -119,7 +116,6 @@ class StrategyExecutor:
         task: 'task_lib.Task',
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
@@ -156,7 +152,7 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     job_logger, pool, starting, starting_lock,
+                                     pool, starting, starting_lock,
                                      starting_signal)
 
     async def launch(self) -> float:
@@ -224,7 +220,7 @@ class StrategyExecutor:
                 **kwargs,
                 _try_cancel_if_cluster_is_init=True,
             )
-            self._logger.debug(f'sdk.cancel request ID: {request_id}')
+            logger.debug(f'sdk.cancel request ID: {request_id}')
             await context_utils.to_thread(
                 sdk.get,
                 request_id,
@@ -261,16 +257,15 @@
                     # loop.
                     # TODO(zhwu): log the unexpected error to usage collection
                     # for future debugging.
-                    self._logger.info(
-                        f'Unexpected exception: {e}\nFailed to get the '
-                        'refresh the cluster status. Retrying.')
+                    logger.info(f'Unexpected exception: {e}\nFailed to get the '
+                                'refresh the cluster status. Retrying.')
                     continue
                 if cluster_status != status_lib.ClusterStatus.UP:
                     # The cluster can be preempted before the job is
                     # launched.
                     # Break to let the retry launch kick in.
-                    self._logger.info('The cluster is preempted before the job '
-                                      'is submitted.')
+                    logger.info('The cluster is preempted before the job '
+                                'is submitted.')
                     # TODO(zhwu): we should recover the preemption with the
                     # recovery strategy instead of the current while loop.
                     break
@@ -279,7 +274,6 @@
                     status = await managed_job_utils.get_job_status(
                         self.backend,
                         self.cluster_name,
-                        job_logger=self._logger,
                         job_id=self.job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # If any unexpected error happens, retry the job checking
@@ -288,9 +282,8 @@
                     # get_job_status, so it should not happen here.
                     # TODO(zhwu): log the unexpected error to usage collection
                     # for future debugging.
-                    self._logger.info(
-                        f'Unexpected exception: {e}\nFailed to get the '
-                        'job status. Retrying.')
+                    logger.info(f'Unexpected exception: {e}\nFailed to get the '
+                                'job status. Retrying.')
                     continue
 
                 # Check the job status until it is not in initialized status
@@ -306,9 +299,8 @@
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
                     # job checking loop.
-                    self._logger.info(
-                        f'Unexpected Exception: {e}\nFailed to get '
-                        'the job start timestamp. Retrying.')
+                    logger.info(f'Unexpected Exception: {e}\nFailed to get '
+                                'the job start timestamp. Retrying.')
                     continue
                 # Wait for the job to be started
                 await asyncio.sleep(
@@ -370,7 +362,6 @@
                     self.starting,
                     self.starting_lock,
                     self.starting_signal,
-                    self._logger,
             ):
                 # The job state may have been PENDING during backoff -
                 # update to STARTING or RECOVERING.
@@ -394,21 +385,19 @@
                         for env_var in ENV_VARS_TO_CLEAR:
                             vars_to_restore[env_var] = os.environ.pop(
                                 env_var, None)
-                            self._logger.debug('Cleared env var: '
-                                               f'{env_var}')
-                        self._logger.debug('Env vars for api_start: '
-                                           f'{os.environ}')
+                            logger.debug('Cleared env var: '
+                                         f'{env_var}')
+                        logger.debug('Env vars for api_start: '
+                                     f'{os.environ}')
                         await context_utils.to_thread(sdk.api_start)
-                        self._logger.info('API server started.')
+                        logger.info('API server started.')
                     finally:
                         for env_var, value in vars_to_restore.items():
                             if value is not None:
-                                self._logger.debug(
-                                    'Restored env var: '
-                                    f'{env_var}: {value}')
+                                logger.debug('Restored env var: '
+                                             f'{env_var}: {value}')
                                 os.environ[env_var] = value
 
-                log_file = _get_logger_file(self._logger)
                 request_id = None
                 try:
                     request_id = await context_utils.to_thread(
@@ -429,31 +418,27 @@
                         # down=True,
                         _is_launched_by_jobs_controller=True,
                     )
-                    self._logger.debug('sdk.launch request ID: '
-                                       f'{request_id}')
-                    if log_file is None:
-                        raise OSError('Log file is None')
-                    with open(log_file, 'a', encoding='utf-8') as f:
-                        await context_utils.to_thread(
-                            sdk.stream_and_get,
-                            request_id,
-                            output_stream=f,
-                        )
+                    logger.debug('sdk.launch request ID: '
+                                 f'{request_id}')
+                    await context_utils.to_thread(
+                        sdk.stream_and_get,
+                        request_id,
+                    )
                 except asyncio.CancelledError:
                     if request_id:
                         req = await context_utils.to_thread(
                             sdk.api_cancel, request_id)
-                        self._logger.debug('sdk.api_cancel request '
-                                           f'ID: {req}')
+                        logger.debug('sdk.api_cancel request '
+                                     f'ID: {req}')
                         try:
                             await context_utils.to_thread(
                                 sdk.get, req)
                         except Exception as e:  # pylint: disable=broad-except
                             # we must still return a CancelledError
-                            self._logger.error(
+                            logger.error(
                                 f'Failed to cancel the job: {e}')
                             raise
-                self._logger.info('Managed job cluster launched.')
+                logger.info('Managed job cluster launched.')
             else:
                 self.cluster_name = await (context_utils.to_thread(
                     serve_utils.get_next_cluster_name, self.pool,
@@ -468,8 +453,8 @@
                         self.dag,
                         cluster_name=self.cluster_name,
                     )
-                    self._logger.debug('sdk.exec request ID: '
-                                       f'{request_id}')
+                    logger.debug('sdk.exec request ID: '
+                                 f'{request_id}')
                     job_id_on_pool_cluster, _ = (
                         await context_utils.to_thread(
                             sdk.get, request_id))
@@ -477,14 +462,14 @@
                     if request_id:
                         req = await context_utils.to_thread(
                             sdk.api_cancel, request_id)
-                        self._logger.debug('sdk.api_cancel request '
-                                           f'ID: {req}')
+                        logger.debug('sdk.api_cancel request '
+                                     f'ID: {req}')
                         try:
                             await context_utils.to_thread(
                                 sdk.get, req)
                         except Exception as e:  # pylint: disable=broad-except
                             # we must still return a CancelledError
-                            self._logger.error(
+                            logger.error(
                                 f'Failed to cancel the job: {e}')
                             raise
                 assert job_id_on_pool_cluster is not None, (
@@ -492,15 +477,14 @@
                 self.job_id_on_pool_cluster = job_id_on_pool_cluster
                 await state.set_job_id_on_pool_cluster_async(
                     self.job_id, job_id_on_pool_cluster)
-                self._logger.info('Managed job cluster launched.')
+                logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
                     exceptions.ResourcesMismatchError,
                     exceptions.StorageSpecError,
                     exceptions.StorageError) as e:
-                self._logger.error(
-                    'Failure happened before provisioning. '
-                    f'{common_utils.format_exception(e)}')
+                logger.error('Failure happened before provisioning. '
+                             f'{common_utils.format_exception(e)}')
                 if raise_on_failure:
                     raise exceptions.ProvisionPrechecksError(
                         reasons=[e])
@@ -528,24 +512,22 @@
                     reasons_str = '; '.join(
                         common_utils.format_exception(err)
                         for err in reasons)
-                    self._logger.error(
+                    logger.error(
                         'Failure happened before provisioning. '
                         f'Failover reasons: {reasons_str}')
                     if raise_on_failure:
                         raise exceptions.ProvisionPrechecksError(
                             reasons)
                     return None
-                self._logger.info(
-                    'Failed to launch a cluster with error: '
-                    f'{common_utils.format_exception(e)})')
+                logger.info('Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
             except Exception as e:  # pylint: disable=broad-except
                 # If the launch fails, it will be recovered by the
                 # following code.
-                self._logger.info(
-                    'Failed to launch a cluster with error: '
-                    f'{common_utils.format_exception(e)})')
+                logger.info('Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
                 with ux_utils.enable_traceback():
-                    self._logger.info(
+                    logger.info(
                         f' Traceback: {traceback.format_exc()}')
             else:  # No exception, the launch succeeds.
                 # At this point, a sky.launch() has succeeded. Cluster
@@ -559,7 +541,7 @@
                 # launch.
                 # TODO(zhwu): log the unexpected error to usage
                 # collection for future debugging.
-                self._logger.info(
+                logger.info(
                     'Failed to successfully submit the job to the '
                     'launched cluster, due to unexpected submission '
                     'errors or the cluster being preempted during '
@@ -594,8 +576,8 @@
                 # Calculate the backoff time and sleep.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
-                self._logger.info('Retrying to launch the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                 await asyncio.sleep(gap_seconds)
                 continue
             else:
@@ -630,15 +612,14 @@ class FailoverStrategyExecutor(StrategyExecutor):
         max_restarts_on_errors: int,
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
     ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, job_logger, pool, starting,
-                         starting_lock, starting_signal)
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -694,14 +675,13 @@ class FailoverStrategyExecutor(StrategyExecutor):
                return job_submitted_at
 
            # Step 2
-            self._logger.debug('Terminating unhealthy cluster and reset cloud '
-                               'region.')
+            logger.debug('Terminating unhealthy cluster and reset cloud '
+                         'region.')
            await context_utils.to_thread(self._cleanup_cluster)
 
            # Step 3
-            self._logger.debug(
-                'Relaunch the cluster without constraining to prior '
-                'cloud/region.')
+            logger.debug('Relaunch the cluster without constraining to prior '
+                         'cloud/region.')
            # Not using self.launch to avoid the retry until up logic.
            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
                                                  raise_on_failure=False,
@@ -709,8 +689,8 @@
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                self._logger.info('Retrying to recover the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                await asyncio.sleep(gap_seconds)
                continue
 
@@ -755,14 +735,12 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
        # task.resources.
 
        # Step 1
-        self._logger.debug(
-            'Terminating unhealthy cluster and reset cloud region.')
+        logger.debug('Terminating unhealthy cluster and reset cloud region.')
        await context_utils.to_thread(self._cleanup_cluster)
 
        # Step 2
-        self._logger.debug(
-            'Relaunch the cluster skipping the previously launched '
-            'cloud/region.')
+        logger.debug('Relaunch the cluster skipping the previously launched '
+                     'cloud/region.')
        if self._launched_resources is not None:
            task = self.dag.tasks[0]
            requested_resources = self._launched_resources
@@ -787,9 +765,8 @@
 
        while True:
            # Step 3
-            self._logger.debug(
-                'Relaunch the cluster without constraining to prior '
-                'cloud/region.')
+            logger.debug('Relaunch the cluster without constraining to prior '
+                         'cloud/region.')
            # Not using self.launch to avoid the retry until up logic.
            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
                                                  raise_on_failure=False,
@@ -797,8 +774,8 @@
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                self._logger.info('Retrying to recover the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                await asyncio.sleep(gap_seconds)
                continue
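The recovery_strategy.py hunks above remove the per-job job_logger that was threaded through every constructor and call site, switching to the module-level logger instead. A hedged sketch of the resulting pattern, using the same sky_logging.init_logger idiom seen in the new modules; the class below is illustrative, not the real StrategyExecutor:

# Illustrative before/after of the logging change; not the actual class.
from sky import sky_logging

logger = sky_logging.init_logger(__name__)  # module-level, shared by all jobs


class ExampleExecutor:

    def __init__(self, job_id: int) -> None:
        # No job_logger parameter anymore: every method logs through the
        # module-level logger, so callers no longer construct and pass
        # per-job Logger objects down the call chain.
        self.job_id = job_id

    def launch(self) -> None:
        logger.info(f'Launching cluster for managed job {self.job_id}')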