skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -6,13 +6,18 @@ https://github.com/ray-project/ray/tree/master/dashboard/client/src) and/or get
6
6
  rid of the SSH port-forwarding business (see cli.py's job_dashboard()
7
7
  comment).
8
8
  """
9
+ import collections
9
10
  import datetime
11
+ import enum
12
+ import os
10
13
  import pathlib
11
14
 
12
15
  import flask
13
16
  import yaml
14
17
 
15
18
  from sky import jobs as managed_jobs
19
+ from sky.client import sdk
20
+ from sky.jobs import constants as managed_job_constants
16
21
  from sky.utils import common_utils
17
22
  from sky.utils import controller_utils
18
23
 
@@ -26,25 +31,112 @@ def _is_running_on_jobs_controller() -> bool:
26
31
  """
27
32
  if pathlib.Path('~/.sky/sky_ray.yml').expanduser().exists():
28
33
  config = yaml.safe_load(
29
- pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text())
34
+ pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text(
35
+ encoding='utf-8'))
30
36
  cluster_name = config.get('cluster_name', '')
31
- candidate_controller_names = (
32
- controller_utils.Controllers.JOBS_CONTROLLER.value.
33
- candidate_cluster_names)
34
37
  # We use startswith instead of exact match because the cluster name in
35
38
  # the yaml file is cluster_name_on_cloud which may have additional
36
39
  suffixes.
37
- return any(
38
- cluster_name.startswith(name)
39
- for name in candidate_controller_names)
40
+ return cluster_name.startswith(
41
+ controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
40
42
  return False
41
43
 
42
44
 
45
+ # Column indices for job table
46
class JobTableColumns(enum.IntEnum):
    """Column indices for the jobs table in the dashboard.

    Each member's value is the position of that column within a dashboard
    row, so a member can be used directly as a list index.

    - DROPDOWN (0): Column for expandable dropdown arrow
    - ID (1): Job ID column
    - TASK (2): Task name/number column
    - NAME (3): Job name column
    - RESOURCES (4): Resources used by job
    - SUBMITTED (5): Job submission timestamp
    - TOTAL_DURATION (6): Total time since job submission
    - JOB_DURATION (7): Actual job runtime
    - RECOVERIES (8): Number of job recoveries
    - STATUS (9): Current job status
    - STARTED (10): Job start timestamp
    - CLUSTER (11): Cluster name
    - REGION (12): Cloud region
    - DETAILS (13): Job details
    - FAILOVER (14): Job failover history
    - ACTIONS (15): Available actions column
    """
    DROPDOWN = 0
    ID = 1
    TASK = 2
    NAME = 3
    RESOURCES = 4
    SUBMITTED = 5
    TOTAL_DURATION = 6
    JOB_DURATION = 7
    RECOVERIES = 8
    STATUS = 9
    STARTED = 10
    CLUSTER = 11
    REGION = 12
    DETAILS = 13
    FAILOVER = 14
    ACTIONS = 15
82
+
83
+
84
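+ # Column headers for the rendered jobs table.
+ # NOTE(review): this order does not match JobTableColumns (e.g. 'Status' is
+ # index 8 here but STATUS = 9 in the enum) - confirm which one is intended.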
85
+ JOB_TABLE_COLUMNS = [
86
+ '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
87
+ 'Job Duration', 'Status', 'Started', 'Cluster', 'Region', 'Failover',
88
+ 'Recoveries', 'Details', 'Actions'
89
+ ]
90
+
91
+ # This column is given by format_job_table but should be ignored.
92
+ SCHED_STATE_COLUMN = 12
93
+
94
+
95
+ def _extract_launch_history(log_content: str) -> str:
96
+ """Extract launch history from log content.
97
+
98
+ Args:
99
+ log_content: Content of the log file.
100
+ Returns:
101
+ A formatted string containing the launch history.
102
+ """
103
+ launches = []
104
+ current_launch = None
105
+
106
+ for line in log_content.splitlines():
107
+ if 'Launching on' in line:
108
+ try:
109
+ parts = line.split(']')
110
+ if len(parts) >= 2:
111
+ timestamp = parts[0].split()[1:3]
112
+ message = parts[1].replace('[0m⚙︎', '').strip()
113
+ formatted_line = f'{" ".join(timestamp)} {message}'
114
+ if current_launch:
115
+ prev_time, prev_target = current_launch.rsplit(
116
+ ' Launching on ', 1)
117
+ launches.append(
118
+ f'{prev_time} Tried to launch on {prev_target}')
119
+
120
+ # Store the current launch
121
+ current_launch = formatted_line
122
+ except IndexError:
123
+ launches.append(line.strip())
124
+
125
+ # Add the final (successful) launch at the beginning
126
+ if current_launch:
127
+ result = [current_launch]
128
+ result.extend(launches)
129
+ return '\n'.join(result)
130
+
131
+ return 'No launch history found'
132
+
133
+
43
134
  @app.route('/')
44
135
  def home():
45
136
  if not _is_running_on_jobs_controller():
46
137
  # Experimental: run on laptop (refresh is very slow).
47
- all_managed_jobs = managed_jobs.queue(refresh=True, skip_finished=False)
138
+ request_id = managed_jobs.queue(refresh=True, skip_finished=False)
139
+ all_managed_jobs = sdk.get(request_id)
48
140
  else:
49
141
  job_table = managed_jobs.dump_managed_job_queue()
50
142
  all_managed_jobs = managed_jobs.load_managed_job_queue(job_table)
@@ -52,36 +144,90 @@ def home():
52
144
  timestamp = datetime.datetime.now(datetime.timezone.utc)
53
145
  rows = managed_jobs.format_job_table(all_managed_jobs,
54
146
  show_all=True,
147
+ show_user=False,
55
148
  return_rows=True)
56
- # Add an empty column for the dropdown button. This will be added in the
57
- # jobs/templates/index.html file.
58
- rows = [[''] + row for row in rows]
59
-
60
- # FIXME(zongheng): make the job table/queue funcs return structured info so
61
- # that we don't have to do things like row[-5] below.
62
- columns = [
63
- '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
64
- 'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region',
65
- 'Failure'
149
+
150
+ status_counts = collections.defaultdict(int)
151
+ for task in all_managed_jobs:
152
+ if not task['status'].is_terminal():
153
+ status_counts[task['status'].value] += 1
154
+
155
+ # Add an empty column for the dropdown button and actions column
156
+ # Exclude SCHED. STATE column
157
+ rows = [
158
+ [''] + row[:SCHED_STATE_COLUMN] + row[SCHED_STATE_COLUMN + 1:] +
159
+ # Add empty cell for failover and actions column
160
+ [''] + [''] for row in rows
66
161
  ]
67
- if rows and len(rows[0]) != len(columns):
162
+
163
+ # Add log content as failover history for each job
164
+ for row in rows:
165
+ job_id = str(row[JobTableColumns.ID]).strip().replace(' ⤳', '')
166
+ if job_id and job_id != '-':
167
+ try:
168
+ log_path = os.path.join(
169
+ os.path.expanduser(
170
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
171
+ f'{job_id}.log')
172
+ if os.path.exists(log_path):
173
+ with open(log_path, 'r', encoding='utf-8') as f:
174
+ log_content = f.read()
175
+ row[JobTableColumns.FAILOVER] = _extract_launch_history(
176
+ log_content)
177
+ else:
178
+ row[JobTableColumns.FAILOVER] = 'Log file not found'
179
+ except (IOError, OSError) as e:
180
+ row[JobTableColumns.FAILOVER] = f'Error reading log: {str(e)}'
181
+ app.logger.error('All managed jobs:')
182
+
183
+ # Validate column count
184
+ if rows and len(rows[0]) != len(JOB_TABLE_COLUMNS):
68
185
  raise RuntimeError(
69
- 'Dashboard code and managed job queue code are out of sync.')
186
+ f'Dashboard code and managed job queue code are out of sync. '
187
+ f'Expected {(JOB_TABLE_COLUMNS)} columns, got {(rows[0])}')
70
188
 
71
- # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'.
189
+ # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'
72
190
  for row in rows:
73
- row[-5] = common_utils.remove_color(row[-5])
74
- # Remove filler rows ([''], ..., ['-']).
75
- rows = [row for row in rows if ''.join(map(str, row)) != '']
191
+ row[JobTableColumns.STATUS] = common_utils.remove_color(
192
+ row[JobTableColumns.STATUS])
193
+
194
+ # Remove filler rows ([''], ..., ['-'])
195
+ rows = [
196
+ row for row in rows
197
+ if ''.join(map(str, row[:JobTableColumns.ACTIONS])) != ''
198
+ ]
199
+
200
+ # Get all unique status values
201
+ status_values = sorted(
202
+ list(set(row[JobTableColumns.STATUS] for row in rows)))
76
203
 
77
204
  rendered_html = flask.render_template(
78
205
  'index.html',
79
- columns=columns,
206
+ columns=JOB_TABLE_COLUMNS,
80
207
  rows=rows,
81
208
  last_updated_timestamp=timestamp,
209
+ status_values=status_values,
210
+ status_counts=status_counts,
82
211
  )
83
212
  return rendered_html
84
213
 
85
214
 
215
@app.route('/download_log/<job_id>')
def download_log(job_id):
    """Serve the controller log of a managed job as a text attachment.

    Args:
        job_id: Job identifier taken from the URL; the file looked up is
            '<job_id>.log' inside JOBS_CONTROLLER_LOGS_DIR.

    Returns:
        A Flask response that downloads the log as 'job_<job_id>.log'.

    Aborts with 404 if the log file does not exist and with 500 if reading
    it fails with an I/O error.
    """
    # abspath keeps a relative directory anchored to the working directory,
    # as os.path.join did before; send_from_directory would otherwise
    # resolve it against the application root.
    logs_dir = os.path.abspath(
        os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR))
    try:
        # job_id comes from the request, so let Flask join the path safely:
        # send_from_directory raises NotFound (404) for a missing file or a
        # path that would escape logs_dir.
        return flask.send_from_directory(logs_dir,
                                         f'{job_id}.log',
                                         mimetype='text/plain',
                                         as_attachment=True,
                                         download_name=f'job_{job_id}.log')
    except (IOError, OSError) as e:
        app.logger.error(f'Error downloading log for job {job_id}: {str(e)}')
        flask.abort(500)
230
+
231
+
86
232
  if __name__ == '__main__':
87
233
  app.run()