skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
|
@@ -10,8 +10,7 @@ import sqlite3
|
|
|
10
10
|
import threading
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import
|
|
14
|
-
Union)
|
|
13
|
+
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
|
|
15
14
|
import urllib.parse
|
|
16
15
|
|
|
17
16
|
import colorama
|
|
@@ -94,6 +93,7 @@ spot_table = sqlalchemy.Table(
|
|
|
94
93
|
sqlalchemy.Column('specs', sqlalchemy.Text),
|
|
95
94
|
sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
|
|
96
95
|
sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
|
|
96
|
+
sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
|
|
97
97
|
)
|
|
98
98
|
|
|
99
99
|
job_info_table = sqlalchemy.Table(
|
|
@@ -109,6 +109,8 @@ job_info_table = sqlalchemy.Table(
|
|
|
109
109
|
server_default=None),
|
|
110
110
|
sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
|
|
111
111
|
sqlalchemy.Column('env_file_path', sqlalchemy.Text),
|
|
112
|
+
sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
|
|
113
|
+
sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
|
|
112
114
|
sqlalchemy.Column('user_hash', sqlalchemy.Text),
|
|
113
115
|
sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
|
|
114
116
|
sqlalchemy.Column('priority',
|
|
@@ -118,6 +120,9 @@ job_info_table = sqlalchemy.Table(
|
|
|
118
120
|
sqlalchemy.Column('original_user_yaml_path',
|
|
119
121
|
sqlalchemy.Text,
|
|
120
122
|
server_default=None),
|
|
123
|
+
sqlalchemy.Column('original_user_yaml_content',
|
|
124
|
+
sqlalchemy.Text,
|
|
125
|
+
server_default=None),
|
|
121
126
|
sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
|
|
122
127
|
sqlalchemy.Column('current_cluster_name',
|
|
123
128
|
sqlalchemy.Text,
|
|
@@ -126,6 +131,9 @@ job_info_table = sqlalchemy.Table(
|
|
|
126
131
|
sqlalchemy.Integer,
|
|
127
132
|
server_default=None),
|
|
128
133
|
sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
|
|
134
|
+
sqlalchemy.Column('controller_logs_cleaned_at',
|
|
135
|
+
sqlalchemy.Float,
|
|
136
|
+
server_default=None),
|
|
129
137
|
)
|
|
130
138
|
|
|
131
139
|
ha_recovery_script_table = sqlalchemy.Table(
|
|
@@ -280,6 +288,27 @@ def _init_db(func):
|
|
|
280
288
|
return wrapper
|
|
281
289
|
|
|
282
290
|
|
|
291
|
+
async def _describe_task_transition_failure(session: sql_async.AsyncSession,
|
|
292
|
+
job_id: int, task_id: int) -> str:
|
|
293
|
+
"""Return a human-readable description when a task transition fails."""
|
|
294
|
+
details = 'Couldn\'t fetch the task details.'
|
|
295
|
+
try:
|
|
296
|
+
debug_result = await session.execute(
|
|
297
|
+
sqlalchemy.select(spot_table.c.status, spot_table.c.end_at).where(
|
|
298
|
+
sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
|
|
299
|
+
spot_table.c.task_id == task_id)))
|
|
300
|
+
rows = debug_result.mappings().all()
|
|
301
|
+
details = (f'{len(rows)} rows matched job {job_id} and task '
|
|
302
|
+
f'{task_id}.')
|
|
303
|
+
for row in rows:
|
|
304
|
+
status = row['status']
|
|
305
|
+
end_at = row['end_at']
|
|
306
|
+
details += f' Status: {status}, End time: {end_at}.'
|
|
307
|
+
except Exception as exc: # pylint: disable=broad-except
|
|
308
|
+
details += f' Error fetching task details: {exc}'
|
|
309
|
+
return details
|
|
310
|
+
|
|
311
|
+
|
|
283
312
|
# job_duration is the time a job actually runs (including the
|
|
284
313
|
# setup duration) before last_recover, excluding the provision
|
|
285
314
|
# and recovery time.
|
|
@@ -293,42 +322,50 @@ def _init_db(func):
|
|
|
293
322
|
# column names in the DB and it corresponds to the combined view
|
|
294
323
|
# by joining the spot and job_info tables.
|
|
295
324
|
def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
|
|
325
|
+
# WARNING: If you update these you may also need to update GetJobTable in
|
|
326
|
+
# the skylet ManagedJobsServiceImpl.
|
|
296
327
|
return {
|
|
297
|
-
'_job_id': r
|
|
298
|
-
'_task_name': r
|
|
299
|
-
'resources': r
|
|
300
|
-
'submitted_at': r
|
|
301
|
-
'status': r
|
|
302
|
-
'run_timestamp': r
|
|
303
|
-
'start_at': r
|
|
304
|
-
'end_at': r
|
|
305
|
-
'last_recovered_at': r
|
|
306
|
-
'recovery_count': r
|
|
307
|
-
'job_duration': r
|
|
308
|
-
'failure_reason': r
|
|
309
|
-
'job_id': r
|
|
310
|
-
|
|
311
|
-
'
|
|
312
|
-
'
|
|
313
|
-
'
|
|
314
|
-
'
|
|
328
|
+
'_job_id': r.get('job_id'), # from spot table
|
|
329
|
+
'_task_name': r.get('job_name'), # deprecated, from spot table
|
|
330
|
+
'resources': r.get('resources'),
|
|
331
|
+
'submitted_at': r.get('submitted_at'),
|
|
332
|
+
'status': r.get('status'),
|
|
333
|
+
'run_timestamp': r.get('run_timestamp'),
|
|
334
|
+
'start_at': r.get('start_at'),
|
|
335
|
+
'end_at': r.get('end_at'),
|
|
336
|
+
'last_recovered_at': r.get('last_recovered_at'),
|
|
337
|
+
'recovery_count': r.get('recovery_count'),
|
|
338
|
+
'job_duration': r.get('job_duration'),
|
|
339
|
+
'failure_reason': r.get('failure_reason'),
|
|
340
|
+
'job_id': r.get(spot_table.c.spot_job_id
|
|
341
|
+
), # ambiguous, use table.column
|
|
342
|
+
'task_id': r.get('task_id'),
|
|
343
|
+
'task_name': r.get('task_name'),
|
|
344
|
+
'specs': r.get('specs'),
|
|
345
|
+
'local_log_file': r.get('local_log_file'),
|
|
346
|
+
'metadata': r.get('metadata'),
|
|
315
347
|
# columns from job_info table (some may be None for legacy jobs)
|
|
316
|
-
'_job_info_job_id': r
|
|
317
|
-
|
|
318
|
-
'job_name': r
|
|
319
|
-
'schedule_state': r
|
|
320
|
-
'controller_pid': r
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
'
|
|
324
|
-
'
|
|
325
|
-
'
|
|
326
|
-
'
|
|
327
|
-
'
|
|
328
|
-
'
|
|
329
|
-
'
|
|
330
|
-
'
|
|
331
|
-
'
|
|
348
|
+
'_job_info_job_id': r.get(job_info_table.c.spot_job_id
|
|
349
|
+
), # ambiguous, use table.column
|
|
350
|
+
'job_name': r.get('name'), # from job_info table
|
|
351
|
+
'schedule_state': r.get('schedule_state'),
|
|
352
|
+
'controller_pid': r.get('controller_pid'),
|
|
353
|
+
# the _path columns are for backwards compatibility, use the _content
|
|
354
|
+
# columns instead
|
|
355
|
+
'dag_yaml_path': r.get('dag_yaml_path'),
|
|
356
|
+
'env_file_path': r.get('env_file_path'),
|
|
357
|
+
'dag_yaml_content': r.get('dag_yaml_content'),
|
|
358
|
+
'env_file_content': r.get('env_file_content'),
|
|
359
|
+
'user_hash': r.get('user_hash'),
|
|
360
|
+
'workspace': r.get('workspace'),
|
|
361
|
+
'priority': r.get('priority'),
|
|
362
|
+
'entrypoint': r.get('entrypoint'),
|
|
363
|
+
'original_user_yaml_path': r.get('original_user_yaml_path'),
|
|
364
|
+
'original_user_yaml_content': r.get('original_user_yaml_content'),
|
|
365
|
+
'pool': r.get('pool'),
|
|
366
|
+
'current_cluster_name': r.get('current_cluster_name'),
|
|
367
|
+
'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
|
|
368
|
+
'pool_hash': r.get('pool_hash'),
|
|
332
369
|
}
|
|
333
370
|
|
|
334
371
|
|
|
@@ -671,8 +708,8 @@ class ManagedJobScheduleState(enum.Enum):
|
|
|
671
708
|
# === Status transition functions ===
|
|
672
709
|
@_init_db
|
|
673
710
|
def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
|
|
674
|
-
pool: Optional[str],
|
|
675
|
-
|
|
711
|
+
pool: Optional[str], pool_hash: Optional[str],
|
|
712
|
+
user_hash: Optional[str]) -> int:
|
|
676
713
|
assert _SQLALCHEMY_ENGINE is not None
|
|
677
714
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
678
715
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -691,6 +728,7 @@ def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
|
|
|
691
728
|
entrypoint=entrypoint,
|
|
692
729
|
pool=pool,
|
|
693
730
|
pool_hash=pool_hash,
|
|
731
|
+
user_hash=user_hash,
|
|
694
732
|
)
|
|
695
733
|
|
|
696
734
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -758,9 +796,12 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
|
|
|
758
796
|
count = result.rowcount
|
|
759
797
|
await session.commit()
|
|
760
798
|
if count != 1:
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
799
|
+
details = await _describe_task_transition_failure(
|
|
800
|
+
session, job_id, task_id)
|
|
801
|
+
message = ('Failed to set the task back to pending. '
|
|
802
|
+
f'({count} rows updated. {details})')
|
|
803
|
+
logger.error(message)
|
|
804
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
764
805
|
# Do not call callback_func here, as we don't use the callback for PENDING.
|
|
765
806
|
|
|
766
807
|
|
|
@@ -789,9 +830,12 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
|
|
|
789
830
|
await session.commit()
|
|
790
831
|
logger.debug(f'back to {target_status}')
|
|
791
832
|
if count != 1:
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
833
|
+
details = await _describe_task_transition_failure(
|
|
834
|
+
session, job_id, task_id)
|
|
835
|
+
message = (f'Failed to set the task back to {target_status}. '
|
|
836
|
+
f'({count} rows updated. {details})')
|
|
837
|
+
logger.error(message)
|
|
838
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
795
839
|
# Do not call callback_func here, as it should only be invoked for the
|
|
796
840
|
# initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
|
|
797
841
|
|
|
@@ -1048,7 +1092,8 @@ def _get_all_task_ids_statuses(
|
|
|
1048
1092
|
|
|
1049
1093
|
@_init_db
|
|
1050
1094
|
def get_all_task_ids_names_statuses_logs(
|
|
1051
|
-
|
|
1095
|
+
job_id: int
|
|
1096
|
+
) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
|
|
1052
1097
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1053
1098
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1054
1099
|
id_names = session.execute(
|
|
@@ -1057,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
|
|
|
1057
1102
|
spot_table.c.task_name,
|
|
1058
1103
|
spot_table.c.status,
|
|
1059
1104
|
spot_table.c.local_log_file,
|
|
1105
|
+
spot_table.c.logs_cleaned_at,
|
|
1060
1106
|
).where(spot_table.c.spot_job_id == job_id).order_by(
|
|
1061
1107
|
spot_table.c.task_id.asc())).fetchall()
|
|
1062
|
-
return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
|
|
1108
|
+
return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
|
|
1063
1109
|
for row in id_names]
|
|
1064
1110
|
|
|
1065
1111
|
|
|
@@ -1124,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:
|
|
|
1124
1170
|
|
|
1125
1171
|
|
|
1126
1172
|
@_init_db
|
|
1127
|
-
def
|
|
1128
|
-
"""Get managed
|
|
1173
|
+
def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
|
|
1174
|
+
"""Get managed job tasks for a specific managed job id from the database."""
|
|
1129
1175
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1130
1176
|
|
|
1131
1177
|
# Join spot and job_info tables to get the job name for each task.
|
|
@@ -1140,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
1140
1186
|
spot_table.outerjoin(
|
|
1141
1187
|
job_info_table,
|
|
1142
1188
|
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
query = query.order_by(spot_table.c.spot_job_id.desc(),
|
|
1146
|
-
spot_table.c.task_id.asc())
|
|
1189
|
+
query = query.where(spot_table.c.spot_job_id == job_id)
|
|
1190
|
+
query = query.order_by(spot_table.c.task_id.asc())
|
|
1147
1191
|
rows = None
|
|
1148
1192
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1149
1193
|
rows = session.execute(query).fetchall()
|
|
@@ -1158,20 +1202,307 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
1158
1202
|
job_dict['metadata'] = json.loads(job_dict['metadata'])
|
|
1159
1203
|
|
|
1160
1204
|
# Add user YAML content for managed jobs.
|
|
1161
|
-
|
|
1162
|
-
if
|
|
1163
|
-
try
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1205
|
+
job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
|
|
1206
|
+
if job_dict['user_yaml'] is None:
|
|
1207
|
+
# Backwards compatibility - try to read from file path
|
|
1208
|
+
yaml_path = job_dict.get('original_user_yaml_path')
|
|
1209
|
+
if yaml_path:
|
|
1210
|
+
try:
|
|
1211
|
+
with open(yaml_path, 'r', encoding='utf-8') as f:
|
|
1212
|
+
job_dict['user_yaml'] = f.read()
|
|
1213
|
+
except (FileNotFoundError, IOError, OSError) as e:
|
|
1214
|
+
logger.debug('Failed to read original user YAML for job '
|
|
1215
|
+
f'{job_id} from {yaml_path}: {e}')
|
|
1170
1216
|
|
|
1171
1217
|
jobs.append(job_dict)
|
|
1172
1218
|
return jobs
|
|
1173
1219
|
|
|
1174
1220
|
|
|
1221
|
+
def _map_response_field_to_db_column(field: str):
|
|
1222
|
+
"""Map the response field name to an actual SQLAlchemy ColumnElement.
|
|
1223
|
+
|
|
1224
|
+
This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
|
|
1225
|
+
Select.with_only_columns().
|
|
1226
|
+
"""
|
|
1227
|
+
# Explicit aliases differing from actual DB column names
|
|
1228
|
+
alias_mapping = {
|
|
1229
|
+
'_job_id': spot_table.c.job_id, # spot.job_id
|
|
1230
|
+
'_task_name': spot_table.c.job_name, # deprecated, from spot table
|
|
1231
|
+
'job_id': spot_table.c.spot_job_id, # public job id -> spot.spot_job_id
|
|
1232
|
+
'_job_info_job_id': job_info_table.c.spot_job_id,
|
|
1233
|
+
'job_name': job_info_table.c.name, # public job name -> job_info.name
|
|
1234
|
+
}
|
|
1235
|
+
if field in alias_mapping:
|
|
1236
|
+
return alias_mapping[field]
|
|
1237
|
+
|
|
1238
|
+
# Try direct match on the `spot` table columns
|
|
1239
|
+
if field in spot_table.c:
|
|
1240
|
+
return spot_table.c[field]
|
|
1241
|
+
|
|
1242
|
+
# Try direct match on the `job_info` table columns
|
|
1243
|
+
if field in job_info_table.c:
|
|
1244
|
+
return job_info_table.c[field]
|
|
1245
|
+
|
|
1246
|
+
raise ValueError(f'Unknown field: {field}')
|
|
1247
|
+
|
|
1248
|
+
|
|
1249
|
+
@_init_db
|
|
1250
|
+
def get_managed_jobs_total() -> int:
|
|
1251
|
+
"""Get the total number of managed jobs."""
|
|
1252
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1253
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1254
|
+
result = session.execute(
|
|
1255
|
+
sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
|
|
1256
|
+
).select_from(spot_table)).fetchone()
|
|
1257
|
+
return result[0] if result else 0
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
@_init_db
|
|
1261
|
+
def get_managed_jobs_highest_priority() -> int:
|
|
1262
|
+
"""Get the highest priority of the managed jobs."""
|
|
1263
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1264
|
+
query = sqlalchemy.select(sqlalchemy.func.max(
|
|
1265
|
+
job_info_table.c.priority)).where(
|
|
1266
|
+
sqlalchemy.and_(
|
|
1267
|
+
job_info_table.c.schedule_state.in_([
|
|
1268
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
|
1269
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value,
|
|
1270
|
+
ManagedJobScheduleState.WAITING.value,
|
|
1271
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
|
1272
|
+
]),
|
|
1273
|
+
job_info_table.c.priority.is_not(None),
|
|
1274
|
+
))
|
|
1275
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1276
|
+
priority = session.execute(query).fetchone()
|
|
1277
|
+
return priority[0] if priority and priority[
|
|
1278
|
+
0] is not None else constants.MIN_PRIORITY
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
def build_managed_jobs_with_filters_no_status_query(
|
|
1282
|
+
fields: Optional[List[str]] = None,
|
|
1283
|
+
job_ids: Optional[List[int]] = None,
|
|
1284
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1285
|
+
workspace_match: Optional[str] = None,
|
|
1286
|
+
name_match: Optional[str] = None,
|
|
1287
|
+
pool_match: Optional[str] = None,
|
|
1288
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1289
|
+
skip_finished: bool = False,
|
|
1290
|
+
count_only: bool = False,
|
|
1291
|
+
status_count: bool = False,
|
|
1292
|
+
) -> sqlalchemy.Select:
|
|
1293
|
+
"""Build a query to get managed jobs from the database with filters."""
|
|
1294
|
+
# Join spot and job_info tables to get the job name for each task.
|
|
1295
|
+
# We use LEFT OUTER JOIN mainly for backward compatibility, as for an
|
|
1296
|
+
# existing controller before #1982, the job_info table may not exist,
|
|
1297
|
+
# and all the managed jobs created before will not present in the
|
|
1298
|
+
# job_info.
|
|
1299
|
+
# Note: we will get the user_hash here, but don't try to call
|
|
1300
|
+
# global_user_state.get_user() on it. This runs on the controller, which may
|
|
1301
|
+
# not have the user info. Prefer to do it on the API server side.
|
|
1302
|
+
if count_only:
|
|
1303
|
+
query = sqlalchemy.select(sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
|
|
1304
|
+
elif status_count:
|
|
1305
|
+
query = sqlalchemy.select(spot_table.c.status,
|
|
1306
|
+
sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
|
|
1307
|
+
else:
|
|
1308
|
+
query = sqlalchemy.select(spot_table, job_info_table)
|
|
1309
|
+
query = query.select_from(
|
|
1310
|
+
spot_table.outerjoin(
|
|
1311
|
+
job_info_table,
|
|
1312
|
+
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
|
1313
|
+
if skip_finished:
|
|
1314
|
+
# Filter out finished jobs at the DB level. If a multi-task job is
|
|
1315
|
+
# partially finished, include all its tasks. We do this by first
|
|
1316
|
+
# selecting job_ids that have at least one non-terminal task, then
|
|
1317
|
+
# restricting the main query to those job_ids.
|
|
1318
|
+
terminal_status_values = [
|
|
1319
|
+
s.value for s in ManagedJobStatus.terminal_statuses()
|
|
1320
|
+
]
|
|
1321
|
+
non_terminal_job_ids_subquery = (sqlalchemy.select(
|
|
1322
|
+
spot_table.c.spot_job_id).where(
|
|
1323
|
+
sqlalchemy.or_(
|
|
1324
|
+
spot_table.c.status.is_(None),
|
|
1325
|
+
sqlalchemy.not_(
|
|
1326
|
+
spot_table.c.status.in_(terminal_status_values)),
|
|
1327
|
+
)).distinct())
|
|
1328
|
+
query = query.where(
|
|
1329
|
+
spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
|
|
1330
|
+
if not count_only and not status_count and fields:
|
|
1331
|
+
# Resolve requested field names to explicit ColumnElements from
|
|
1332
|
+
# the joined tables.
|
|
1333
|
+
selected_columns = [_map_response_field_to_db_column(f) for f in fields]
|
|
1334
|
+
query = query.with_only_columns(*selected_columns)
|
|
1335
|
+
if job_ids is not None:
|
|
1336
|
+
query = query.where(spot_table.c.spot_job_id.in_(job_ids))
|
|
1337
|
+
if accessible_workspaces is not None:
|
|
1338
|
+
query = query.where(
|
|
1339
|
+
job_info_table.c.workspace.in_(accessible_workspaces))
|
|
1340
|
+
if workspace_match is not None:
|
|
1341
|
+
query = query.where(
|
|
1342
|
+
job_info_table.c.workspace.like(f'%{workspace_match}%'))
|
|
1343
|
+
if name_match is not None:
|
|
1344
|
+
query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
|
|
1345
|
+
if pool_match is not None:
|
|
1346
|
+
query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
|
|
1347
|
+
if user_hashes is not None:
|
|
1348
|
+
query = query.where(job_info_table.c.user_hash.in_(user_hashes))
|
|
1349
|
+
return query
|
|
1350
|
+
|
|
1351
|
+
|
|
1352
|
+
def build_managed_jobs_with_filters_query(
|
|
1353
|
+
fields: Optional[List[str]] = None,
|
|
1354
|
+
job_ids: Optional[List[int]] = None,
|
|
1355
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1356
|
+
workspace_match: Optional[str] = None,
|
|
1357
|
+
name_match: Optional[str] = None,
|
|
1358
|
+
pool_match: Optional[str] = None,
|
|
1359
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1360
|
+
statuses: Optional[List[str]] = None,
|
|
1361
|
+
skip_finished: bool = False,
|
|
1362
|
+
count_only: bool = False,
|
|
1363
|
+
) -> sqlalchemy.Select:
|
|
1364
|
+
"""Build a query to get managed jobs from the database with filters."""
|
|
1365
|
+
query = build_managed_jobs_with_filters_no_status_query(
|
|
1366
|
+
fields=fields,
|
|
1367
|
+
job_ids=job_ids,
|
|
1368
|
+
accessible_workspaces=accessible_workspaces,
|
|
1369
|
+
workspace_match=workspace_match,
|
|
1370
|
+
name_match=name_match,
|
|
1371
|
+
pool_match=pool_match,
|
|
1372
|
+
user_hashes=user_hashes,
|
|
1373
|
+
skip_finished=skip_finished,
|
|
1374
|
+
count_only=count_only,
|
|
1375
|
+
)
|
|
1376
|
+
if statuses is not None:
|
|
1377
|
+
query = query.where(spot_table.c.status.in_(statuses))
|
|
1378
|
+
return query
|
|
1379
|
+
|
|
1380
|
+
|
|
1381
|
+
@_init_db
def get_status_count_with_filters(
    fields: Optional[List[str]] = None,
    job_ids: Optional[List[int]] = None,
    accessible_workspaces: Optional[List[str]] = None,
    workspace_match: Optional[str] = None,
    name_match: Optional[str] = None,
    pool_match: Optional[str] = None,
    user_hashes: Optional[List[Optional[str]]] = None,
    skip_finished: bool = False,
) -> Dict[str, int]:
    """Count managed job rows per status, applying the non-status filters.

    The filters are forwarded to
    ``build_managed_jobs_with_filters_no_status_query`` with
    ``status_count=True`` and the query is grouped by ``spot_table.status``.

    Returns:
        A dict mapping each status (converted with ``str``) to its row count.
    """
    grouped_query = build_managed_jobs_with_filters_no_status_query(
        fields=fields,
        job_ids=job_ids,
        accessible_workspaces=accessible_workspaces,
        workspace_match=workspace_match,
        name_match=name_match,
        pool_match=pool_match,
        user_hashes=user_hashes,
        skip_finished=skip_finished,
        status_count=True,
    ).group_by(spot_table.c.status)

    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        status_rows = session.execute(grouped_query).fetchall()

    # Each result row unpacks as (status, count); the status is stored as the
    # enum's string value.
    return {
        str(status_value): int(count) for status_value, count in status_rows
    }
|
|
1413
|
+
|
|
1414
|
+
|
|
1415
|
+
@_init_db
def get_managed_jobs_with_filters(
    fields: Optional[List[str]] = None,
    job_ids: Optional[List[int]] = None,
    accessible_workspaces: Optional[List[str]] = None,
    workspace_match: Optional[str] = None,
    name_match: Optional[str] = None,
    pool_match: Optional[str] = None,
    user_hashes: Optional[List[Optional[str]]] = None,
    statuses: Optional[List[str]] = None,
    skip_finished: bool = False,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], int]:
    """Fetch the managed jobs matching the filters, plus the match count.

    Two queries are run: a count query (``count_only=True``, not paginated)
    and the row query, ordered by job id descending then task id ascending.

    Args:
        fields: Forwarded to the query builder for the row query only; the
            count query always passes ``None``.
        page: 1-based page number. Only applied when ``limit`` is also set.
        limit: Page size. Only applied when ``page`` is also set.
        job_ids, accessible_workspaces, workspace_match, name_match,
        pool_match, user_hashes, statuses, skip_finished: Forwarded unchanged
            to ``build_managed_jobs_with_filters_query``.

    Returns:
        A tuple containing
        - the list of managed jobs (one dict per row)
        - the total number of managed jobs matching the filters
    """
    assert _SQLALCHEMY_ENGINE is not None

    # Filters shared by the count query and the row query.
    shared_filters: Dict[str, Any] = {
        'job_ids': job_ids,
        'accessible_workspaces': accessible_workspaces,
        'workspace_match': workspace_match,
        'name_match': name_match,
        'pool_match': pool_match,
        'user_hashes': user_hashes,
        'statuses': statuses,
        'skip_finished': skip_finished,
    }

    count_query = build_managed_jobs_with_filters_query(fields=None,
                                                        count_only=True,
                                                        **shared_filters)
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        total = session.execute(count_query).fetchone()[0]

    rows_query = build_managed_jobs_with_filters_query(fields=fields,
                                                       **shared_filters)
    rows_query = rows_query.order_by(spot_table.c.spot_job_id.desc(),
                                     spot_table.c.task_id.asc())
    if page is not None and limit is not None:
        rows_query = rows_query.offset((page - 1) * limit).limit(limit)
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        records = session.execute(rows_query).fetchall()

    jobs = []
    for record in records:
        # pylint: disable-next=protected-access
        job = _get_jobs_dict(record._mapping)

        # Convert the stored raw values back into their enum types.
        raw_status = job.get('status')
        if raw_status is not None:
            job['status'] = ManagedJobStatus(raw_status)
        raw_schedule_state = job.get('schedule_state')
        if raw_schedule_state is not None:
            job['schedule_state'] = ManagedJobScheduleState(raw_schedule_state)

        # Fall back to the task name when the job itself has no name.
        if job.get('job_name') is None:
            job['job_name'] = job.get('task_name')

        raw_metadata = job.get('metadata')
        if raw_metadata is not None:
            job['metadata'] = json.loads(raw_metadata)

        # Add user YAML content for managed jobs.
        inline_yaml = job.get('original_user_yaml_content')
        job['user_yaml'] = inline_yaml
        # Backwards compatibility - when no content is stored, try to read
        # the YAML from the recorded file path instead.
        yaml_path = (job.get('original_user_yaml_path')
                     if inline_yaml is None else None)
        if yaml_path:
            try:
                with open(yaml_path, 'r', encoding='utf-8') as yaml_file:
                    job['user_yaml'] = yaml_file.read()
            except OSError as e:
                # Best effort: a missing/unreadable file leaves user_yaml as
                # None.
                job_id = job.get('job_id')
                if job_id is not None:
                    logger.debug('Failed to read original user YAML for '
                                 f'job {job_id} from {yaml_path}: {e}')
                else:
                    logger.debug('Failed to read original user YAML from '
                                 f'{yaml_path}: {e}')

        jobs.append(job)
    return jobs, total
|
|
1504
|
+
|
|
1505
|
+
|
|
1175
1506
|
@_init_db
|
|
1176
1507
|
def get_task_name(job_id: int, task_id: int) -> str:
|
|
1177
1508
|
"""Get the task name of a job."""
|
|
@@ -1212,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
|
|
|
1212
1543
|
|
|
1213
1544
|
|
|
1214
1545
|
@_init_db
|
|
1215
|
-
def scheduler_set_waiting(job_id: int,
|
|
1216
|
-
|
|
1217
|
-
|
|
1546
|
+
def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
|
|
1547
|
+
original_user_yaml_content: str,
|
|
1548
|
+
env_file_content: str, priority: int):
|
|
1218
1549
|
"""Do not call without holding the scheduler lock.
|
|
1219
1550
|
|
|
1220
1551
|
Returns: Whether this is a recovery run or not.
|
|
@@ -1226,20 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
|
|
|
1226
1557
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1227
1558
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1228
1559
|
updated_count = session.query(job_info_table).filter(
|
|
1229
|
-
sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
})
|
|
1560
|
+
sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
|
|
1561
|
+
job_info_table.c.schedule_state:
|
|
1562
|
+
ManagedJobScheduleState.WAITING.value,
|
|
1563
|
+
job_info_table.c.dag_yaml_content: dag_yaml_content,
|
|
1564
|
+
job_info_table.c.original_user_yaml_content:
|
|
1565
|
+
(original_user_yaml_content),
|
|
1566
|
+
job_info_table.c.env_file_content: env_file_content,
|
|
1567
|
+
job_info_table.c.priority: priority,
|
|
1568
|
+
})
|
|
1239
1569
|
session.commit()
|
|
1240
1570
|
assert updated_count <= 1, (job_id, updated_count)
|
|
1241
1571
|
|
|
1242
1572
|
|
|
1573
|
+
@_init_db
def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
    """Look up the DAG YAML / env file paths and stored contents of a job.

    Returns:
        A dict with the keys ``dag_yaml_path``, ``env_file_path``,
        ``dag_yaml_content`` and ``env_file_content``. All values are None
        when no row exists for ``job_id``.
    """
    assert _SQLALCHEMY_ENGINE is not None
    # Result keys, in the same order as the selected columns below.
    result_keys = (
        'dag_yaml_path',
        'env_file_path',
        'dag_yaml_content',
        'env_file_content',
    )
    lookup = sqlalchemy.select(
        job_info_table.c.dag_yaml_path,
        job_info_table.c.env_file_path,
        job_info_table.c.dag_yaml_content,
        job_info_table.c.env_file_content,
    ).where(job_info_table.c.spot_job_id == job_id)
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        record = session.execute(lookup).fetchone()

    if record is None:
        return {key: None for key in result_keys}
    return {key: record[idx] for idx, key in enumerate(result_keys)}
|
|
1600
|
+
|
|
1601
|
+
|
|
1243
1602
|
@_init_db
|
|
1244
1603
|
def get_pool_from_job_id(job_id: int) -> Optional[str]:
|
|
1245
1604
|
"""Get the pool from the job id."""
|
|
@@ -1251,25 +1610,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
|
|
|
1251
1610
|
return pool[0] if pool else None
|
|
1252
1611
|
|
|
1253
1612
|
|
|
1254
|
-
@_init_db
|
|
1255
|
-
def get_pool_and_submit_info_from_job_ids(
|
|
1256
|
-
job_ids: Set[int]
|
|
1257
|
-
) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
|
|
1258
|
-
"""Get the pool, cluster name, and job id on pool from job id"""
|
|
1259
|
-
assert _SQLALCHEMY_ENGINE is not None
|
|
1260
|
-
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1261
|
-
rows = session.execute(
|
|
1262
|
-
sqlalchemy.select(
|
|
1263
|
-
job_info_table.c.spot_job_id, job_info_table.c.pool,
|
|
1264
|
-
job_info_table.c.current_cluster_name,
|
|
1265
|
-
job_info_table.c.job_id_on_pool_cluster).where(
|
|
1266
|
-
job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
|
|
1267
|
-
return {
|
|
1268
|
-
job_id: (pool, cluster_name, job_id_on_pool_cluster)
|
|
1269
|
-
for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
|
|
1270
|
-
}
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
1613
|
@_init_db
|
|
1274
1614
|
def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
|
|
1275
1615
|
"""Set the current cluster name for a job."""
|
|
@@ -1644,9 +1984,12 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
|
|
|
1644
1984
|
count = result.rowcount
|
|
1645
1985
|
await session.commit()
|
|
1646
1986
|
if count != 1:
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1987
|
+
details = await _describe_task_transition_failure(
|
|
1988
|
+
session, job_id, task_id)
|
|
1989
|
+
message = ('Failed to set the task to starting. '
|
|
1990
|
+
f'({count} rows updated. {details})')
|
|
1991
|
+
logger.error(message)
|
|
1992
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
1650
1993
|
await callback_func('SUBMITTED')
|
|
1651
1994
|
await callback_func('STARTING')
|
|
1652
1995
|
|
|
@@ -1676,9 +2019,12 @@ async def set_started_async(job_id: int, task_id: int, start_time: float,
|
|
|
1676
2019
|
count = result.rowcount
|
|
1677
2020
|
await session.commit()
|
|
1678
2021
|
if count != 1:
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
2022
|
+
details = await _describe_task_transition_failure(
|
|
2023
|
+
session, job_id, task_id)
|
|
2024
|
+
message = (f'Failed to set the task to started. '
|
|
2025
|
+
f'({count} rows updated. {details})')
|
|
2026
|
+
logger.error(message)
|
|
2027
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
1682
2028
|
await callback_func('STARTED')
|
|
1683
2029
|
|
|
1684
2030
|
|
|
@@ -1733,9 +2079,14 @@ async def set_recovering_async(job_id: int, task_id: int,
|
|
|
1733
2079
|
count = result.rowcount
|
|
1734
2080
|
await session.commit()
|
|
1735
2081
|
if count != 1:
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
2082
|
+
details = await _describe_task_transition_failure(
|
|
2083
|
+
session, job_id, task_id)
|
|
2084
|
+
message = ('Failed to set the task to recovering with '
|
|
2085
|
+
'force_transit_to_recovering='
|
|
2086
|
+
f'{force_transit_to_recovering}. '
|
|
2087
|
+
f'({count} rows updated. {details})')
|
|
2088
|
+
logger.error(message)
|
|
2089
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
1739
2090
|
await callback_func('RECOVERING')
|
|
1740
2091
|
|
|
1741
2092
|
|
|
@@ -1761,9 +2112,12 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
|
|
|
1761
2112
|
count = result.rowcount
|
|
1762
2113
|
await session.commit()
|
|
1763
2114
|
if count != 1:
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
2115
|
+
details = await _describe_task_transition_failure(
|
|
2116
|
+
session, job_id, task_id)
|
|
2117
|
+
message = (f'Failed to set the task to recovered. '
|
|
2118
|
+
f'({count} rows updated. {details})')
|
|
2119
|
+
logger.error(message)
|
|
2120
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
1767
2121
|
logger.info('==== Recovered. ====')
|
|
1768
2122
|
await callback_func('RECOVERED')
|
|
1769
2123
|
|
|
@@ -1788,9 +2142,12 @@ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
|
|
|
1788
2142
|
count = result.rowcount
|
|
1789
2143
|
await session.commit()
|
|
1790
2144
|
if count != 1:
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
2145
|
+
details = await _describe_task_transition_failure(
|
|
2146
|
+
session, job_id, task_id)
|
|
2147
|
+
message = (f'Failed to set the task to succeeded. '
|
|
2148
|
+
f'({count} rows updated. {details})')
|
|
2149
|
+
logger.error(message)
|
|
2150
|
+
raise exceptions.ManagedJobStatusError(message)
|
|
1794
2151
|
await callback_func('SUCCEEDED')
|
|
1795
2152
|
logger.info('Job succeeded.')
|
|
1796
2153
|
|
|
@@ -1956,8 +2313,13 @@ async def scheduler_set_done_async(job_id: int,
|
|
|
1956
2313
|
|
|
1957
2314
|
|
|
1958
2315
|
@_init_db
|
|
1959
|
-
def set_job_info(job_id: int,
|
|
1960
|
-
|
|
2316
|
+
def set_job_info(job_id: int,
|
|
2317
|
+
name: str,
|
|
2318
|
+
workspace: str,
|
|
2319
|
+
entrypoint: str,
|
|
2320
|
+
pool: Optional[str],
|
|
2321
|
+
pool_hash: Optional[str],
|
|
2322
|
+
user_hash: Optional[str] = None):
|
|
1961
2323
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1962
2324
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1963
2325
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -1976,6 +2338,7 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
|
|
|
1976
2338
|
entrypoint=entrypoint,
|
|
1977
2339
|
pool=pool,
|
|
1978
2340
|
pool_hash=pool_hash,
|
|
2341
|
+
user_hash=user_hash,
|
|
1979
2342
|
)
|
|
1980
2343
|
session.execute(insert_stmt)
|
|
1981
2344
|
session.commit()
|
|
@@ -2029,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
|
|
|
2029
2392
|
rows = session.execute(query).fetchall()
|
|
2030
2393
|
job_ids = [row[0] for row in rows if row[0] is not None]
|
|
2031
2394
|
return job_ids
|
|
2395
|
+
|
|
2396
|
+
|
|
2397
|
+
@_init_db_async
async def get_task_logs_to_clean_async(
        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
    """Get the logs of job tasks to clean.

    The logs of a task are only cleaned when:
    - the job schedule state is DONE
    - AND the end time of the task is older than the retention period
    - AND the task logs have not been marked as cleaned yet
    - AND the task has a local log file recorded

    Args:
        retention_seconds: Minimum age, in seconds since the task's
            ``end_at``, before its logs become eligible.
        batch_size: Maximum number of tasks to return.

    Returns:
        A list of dicts with the keys ``job_id``, ``task_id`` and
        ``local_log_file``, one per eligible task.
    """

    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        now = time.time()
        result = await session.execute(
            sqlalchemy.select(
                spot_table.c.spot_job_id,
                spot_table.c.task_id,
                spot_table.c.local_log_file,
            ).select_from(
                spot_table.join(
                    job_info_table,
                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
                )).where(
                    sqlalchemy.and_(
                        # Compare with `==`, not `.is_()`: `IS <string>` is
                        # accepted by SQLite but is not valid PostgreSQL.
                        # Both forms select the same rows.
                        job_info_table.c.schedule_state ==
                        ManagedJobScheduleState.DONE.value,
                        spot_table.c.end_at.isnot(None),
                        spot_table.c.end_at < (now - retention_seconds),
                        spot_table.c.logs_cleaned_at.is_(None),
                        # The local log file is set AFTER the task is
                        # finished, add this condition to ensure the entire
                        # log file has been written.
                        spot_table.c.local_log_file.isnot(None),
                    )).limit(batch_size))
        rows = result.fetchall()
        return [{
            'job_id': row[0],
            'task_id': row[1],
            'local_log_file': row[2]
        } for row in rows]
|
|
2438
|
+
|
|
2439
|
+
|
|
2440
|
+
@_init_db_async
async def get_controller_logs_to_clean_async(
        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
    """Get the controller logs to clean.

    The controller logs are only cleaned when:
    - the job schedule state is DONE
    - AND the end time of the latest task is older than the retention period
    - AND the controller logs have not been marked as cleaned yet

    Only tasks that have a local log file recorded are considered when
    computing the latest end time.

    Args:
        retention_seconds: Minimum age, in seconds since the latest task's
            ``end_at``, before the controller logs become eligible.
        batch_size: Maximum number of jobs to return.

    Returns:
        A list of dicts with the single key ``job_id``, one per eligible job.
    """

    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        now = time.time()

        # MAX(end_at) over the job's tasks, i.e. when the latest task ended.
        latest_end_at = sqlalchemy.func.max(spot_table.c.end_at)
        result = await session.execute(
            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
                job_info_table.join(
                    spot_table,
                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
                )).where(
                    sqlalchemy.and_(
                        # Compare with `==`, not `.is_()`: `IS <string>` is
                        # accepted by SQLite but is not valid PostgreSQL.
                        # Both forms select the same rows.
                        job_info_table.c.schedule_state ==
                        ManagedJobScheduleState.DONE.value,
                        spot_table.c.local_log_file.isnot(None),
                        job_info_table.c.controller_logs_cleaned_at.is_(None),
                    )).group_by(
                        job_info_table.c.spot_job_id,
                        job_info_table.c.current_cluster_name,
                    ).having(latest_end_at.isnot(None),).having(
                        latest_end_at < (now - retention_seconds)).limit(
                            batch_size))
        rows = result.fetchall()
        return [{'job_id': row[0]} for row in rows]
|
|
2475
|
+
|
|
2476
|
+
|
|
2477
|
+
@_init_db_async
async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
                                      logs_cleaned_at: float):
    """Record ``logs_cleaned_at`` on the given tasks.

    Args:
        tasks: ``(job_id, task_id)`` pairs identifying the tasks to update.
            Nothing is done when the list is empty.
        logs_cleaned_at: Value written to the ``logs_cleaned_at`` column.
    """
    if not tasks:
        return
    # dict.fromkeys drops duplicate pairs while keeping first-seen order.
    unique_tasks = list(dict.fromkeys(tasks))
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        # Match rows on the composite (job id, task id) key.
        task_key = sqlalchemy.tuple_(spot_table.c.spot_job_id,
                                     spot_table.c.task_id)
        mark_cleaned = sqlalchemy.update(spot_table).where(
            task_key.in_(unique_tasks)).values(logs_cleaned_at=logs_cleaned_at)
        await session.execute(mark_cleaned)
        await session.commit()
|
|
2493
|
+
|
|
2494
|
+
|
|
2495
|
+
@_init_db_async
async def set_controller_logs_cleaned_async(job_ids: List[int],
                                            logs_cleaned_at: float):
    """Record ``controller_logs_cleaned_at`` on the given jobs.

    Args:
        job_ids: Ids of the jobs to update. Nothing is done when the list is
            empty.
        logs_cleaned_at: Value written to the ``controller_logs_cleaned_at``
            column.
    """
    if not job_ids:
        return
    # dict.fromkeys drops duplicate ids while keeping first-seen order.
    unique_job_ids = list(dict.fromkeys(job_ids))
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        mark_cleaned = sqlalchemy.update(job_info_table).where(
            job_info_table.c.spot_job_id.in_(unique_job_ids)).values(
                controller_logs_cleaned_at=logs_cleaned_at)
        await session.execute(mark_cleaned)
        await session.commit()
|