toil 8.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +4 -4
- toil/batchSystems/options.py +1 -0
- toil/batchSystems/slurm.py +227 -83
- toil/common.py +161 -45
- toil/cwl/cwltoil.py +31 -10
- toil/job.py +47 -38
- toil/jobStores/aws/jobStore.py +46 -10
- toil/lib/aws/session.py +14 -3
- toil/lib/aws/utils.py +92 -35
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2nodes.py +3 -2
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/io.py +22 -1
- toil/lib/misc.py +18 -0
- toil/lib/retry.py +10 -10
- toil/lib/{integration.py → trs.py} +95 -46
- toil/lib/web.py +38 -0
- toil/options/common.py +17 -2
- toil/options/cwl.py +10 -0
- toil/provisioners/gceProvisioner.py +4 -4
- toil/server/cli/wes_cwl_runner.py +3 -3
- toil/server/utils.py +2 -3
- toil/statsAndLogging.py +35 -1
- toil/test/batchSystems/test_slurm.py +172 -2
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +105 -2
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_trs.py +161 -0
- toil/test/wdl/wdltoil_test.py +1 -1
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +23 -9
- toil/worker.py +113 -33
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/METADATA +9 -4
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/RECORD +40 -34
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil/test/lib/test_integration.py +0 -104
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/lib/history.py
ADDED
@@ -0,0 +1,1271 @@
+# Copyright (C) 2024 Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Contains tools for tracking history.
+"""
+
+from contextlib import contextmanager
+import logging
+import os
+import sqlite3
+import sys
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any, Iterable, Iterator, Optional, TypeVar, Callable
+
+from toil.lib.io import get_toil_home
+from toil.lib.retry import ErrorCondition, retry
+
+logger = logging.getLogger(__name__)
+
+class HistoryDatabaseSchemaTooNewError(RuntimeError):
+    """
+    Raised when we would write to the history database, but its schema is too
+    new for us to understand.
+    """
+    pass
+
+@dataclass
+class WorkflowSummary:
+    """
+    Data class holding summary information for a workflow.
+
+    Represents all the attempts to execute one run of a workflow.
+    """
+    id: str
+    name: Optional[str]
+    job_store: str
+    total_attempts: int
+    total_job_attempts: int
+    succeeded: bool
+    start_time: Optional[float]
+    """
+    Time when the first workflow attempt started, in seconds since epoch.
+
+    None if there are no attempts recorded.
+    """
+    runtime: Optional[float]
+    """
+    Time from the first workflow attempt's start to the last one's end, in seconds.
+
+    None if there are no attempts recorded.
+    """
+    trs_spec: Optional[str]
+
+@dataclass
+class WorkflowAttemptSummary:
+    """
+    Data class holding summary information for a workflow attempt.
+
+    Helpfully includes the workflow metadata for Dockstore.
+    """
+    workflow_id: str
+    attempt_number: int
+    succeeded: bool
+    start_time: float
+    runtime: float
+    submitted_to_dockstore: bool
+    batch_system: Optional[str]
+    caching: Optional[bool]
+    toil_version: Optional[str]
+    python_version: Optional[str]
+    platform_system: Optional[str]
+    platform_machine: Optional[str]
+    workflow_job_store: str
+    workflow_trs_spec: Optional[str]
+
+@dataclass
+class JobAttemptSummary:
+    """
+    Data class holding summary information for a job attempt within a known
+    workflow attempt.
+    """
+    id: str
+    job_name: str
+    succeeded: bool
+    start_time: float
+    runtime: float
+    submitted_to_dockstore: bool
+    cores: Optional[float]
+    cpu_seconds: Optional[float]
+    memory_bytes: Optional[int]
+    disk_bytes: Optional[int]
+
+
+RT = TypeVar("RT")
+
+def db_retry(function: Callable[..., RT]) -> Callable[..., RT]:
+    """
+    Decorate a function with the appropriate retries for accessing the database.
+    """
+    return retry(
+        infinite_retries=True,
+        errors=[
+            ErrorCondition(
+                error=sqlite3.OperationalError, error_message_must_include="is locked"
+            )
+        ],
+    )(function)
+
+class HistoryManager:
+    """
+    Class responsible for managing the history of Toil runs.
+    """
+
+    # Should workflow run history be recorded?
+    WORKFLOW_HISTORY_ENABLED = True
+    # Should job history be recorded? Can only be true if
+    # WORKFLOW_HISTORY_ENABLED is also true.
+    #
+    # TODO: When Dockstore can take job metrics alongside whole-workflow
+    # metrics, and we've tested to make sure history recording doesn't slow
+    # down our leader job processing rate, turn on actual job history logging.
+    JOB_HISTORY_ENABLED = False
+
+    # For testing, we can move the database path for the class.
+    database_path_override: Optional[str] = None
+
+    @classmethod
+    def database_path(cls) -> str:
+        """
+        Get the path at which the database we store history in lives.
+        """
+        if cls.database_path_override is not None:
+            # Under test, we can use a temporary path.
+            return cls.database_path_override
+
+        return os.path.join(get_toil_home(), "history.sqlite")
+
+    @classmethod
+    def connection(cls) -> sqlite3.Connection:
+        """
+        Connect to the history database.
+
+        Caller must not actually use the connection without using
+        ensure_tables() to protect reads and updates.
+
+        Must be called from inside a top-level method marked @db_retry.
+
+        The connection will be in DEFERRED isolation_level, with autocommit off
+        on Python versions that support it. In order to run any commands
+        outside of a transaction use the no_transaction context manager.
+        """
+        if not os.path.exists(cls.database_path()):
+            # Make the database and protect it from snoopers and busybodies
+            con = sqlite3.connect(cls.database_path())
+            del con
+            os.chmod(cls.database_path(), 0o600)
+
+        con = sqlite3.connect(
+            cls.database_path(),
+            isolation_level="DEFERRED"
+        )
+
+        with cls.no_transaction(con):
+            # Turn on foreign keys.
+            # This has to be outside any transaction.
+            # See <https://stackoverflow.com/q/78898176>
+            con.execute("PRAGMA foreign_keys = ON")
+        # This has the side effect of definitely leaving autocommit off, which
+        # is what we want as the base state.
+
+        # Set up the connection to use the Row class so that we can look up row values by column name and not just order.
+        con.row_factory = sqlite3.Row
+
+        return con
+
+    @classmethod
+    @contextmanager
+    def no_transaction(cls, con: sqlite3.Connection) -> Iterator[None]:
+        """
+        Temporarily disable the constant active transaction on the database
+        connection, on Python versions where it exists.
+
+        Commits the current transaction.
+        """
+
+        con.commit()
+        if hasattr(con, 'autocommit'):
+            con.autocommit = True
+        yield
+        if hasattr(con, 'autocommit'):
+            con.autocommit = False
+
+    @classmethod
+    def ensure_tables(cls, con: sqlite3.Connection, cur: sqlite3.Cursor) -> None:
+        """
+        Ensure that tables exist in the database and the schema is migrated to the current version.
+
+        Leaves the cursor in a transaction where the schema version is known to be correct.
+
+        Must be called from inside a top-level method marked @db_retry.
+
+        :raises HistoryDatabaseSchemaTooNewError: If the schema is newer than the current version.
+        """
+
+        # Python already puts us in a transaction.
+
+        # TODO: Do a try-and-fall-back to avoid sending the table schema for
+        # this every time we do anything.
+        cur.execute("""
+            CREATE TABLE IF NOT EXISTS migrations (
+                version INT NOT NULL PRIMARY KEY,
+                description TEXT
+            )
+        """)
+        db_version = next(cur.execute("SELECT MAX(version) FROM migrations"))[0]
+        if db_version is None:
+            db_version = -1
+
+        # This holds pairs of description and command lists.
+        # To make a schema change, ADD A NEW PAIR AT THE END, and include
+        # statements to adjust existing data.
+        migrations = [
+            (
+                "Make initial tables",
+                [
+                    """
+                    CREATE TABLE workflows (
+                        id TEXT NOT NULL PRIMARY KEY,
+                        job_store TEXT NOT NULL,
+                        creation_time REAL NOT NULL,
+                        name TEXT,
+                        trs_spec TEXT
+                    )
+                    """,
+                    """
+                    CREATE INDEX idx_workflows_by_creation_time
+                    ON workflows (creation_time)
+                    """,
+                    # There's no reference constraint from the job attempts to
+                    # the workflow attempts because the jobs for a workflow
+                    # attempt need to go in before the attempt is known to be
+                    # finished or failed/before the attempt is submittable to
+                    # Dockstore.
+                    #
+                    # TODO: Should we force workflow attempts to be reported on
+                    # start so that we can have the jobs key-reference them?
+                    # And so that we always have a start time for the workflow
+                    # as a whole?
+                    """
+                    CREATE TABLE job_attempts (
+                        id TEXT NOT NULL PRIMARY KEY,
+                        workflow_id TEXT NOT NULL,
+                        workflow_attempt_number INT NOT NULL,
+                        job_name TEXT NOT NULL,
+                        succeeded INTEGER NOT NULL,
+                        start_time REAL NOT NULL,
+                        runtime REAL NOT NULL,
+                        cores REAL,
+                        cpu_seconds REAL,
+                        memory_bytes INTEGER,
+                        disk_bytes INTEGER,
+                        submitted_to_dockstore INTEGER NOT NULL DEFAULT FALSE,
+                        FOREIGN KEY(workflow_id) REFERENCES workflows(id)
+                    )
+                    """,
+                    """
+                    CREATE INDEX idx_job_attempts_by_workflow_attempt
+                    ON job_attempts (workflow_id, workflow_attempt_number)
+                    """,
+                    """
+                    CREATE TABLE workflow_attempts (
+                        workflow_id TEXT NOT NULL,
+                        attempt_number INTEGER NOT NULL,
+                        succeeded INTEGER NOT NULL,
+                        start_time REAL NOT NULL,
+                        runtime REAL NOT NULL,
+                        batch_system TEXT,
+                        caching INTEGER,
+                        toil_version TEXT,
+                        python_version TEXT,
+                        platform_system TEXT,
+                        platform_machine TEXT,
+                        submitted_to_dockstore INTEGER NOT NULL DEFAULT FALSE,
+                        PRIMARY KEY(workflow_id,attempt_number),
+                        FOREIGN KEY(workflow_id) REFERENCES workflows(id)
+                    )
+                    """
+                ],
+            ),
+        ]
+
+        if db_version + 1 > len(migrations):
+            raise HistoryDatabaseSchemaTooNewError(f"History database version is {db_version}, but known migrations only go up to {len(migrations) - 1}")
+
+        for migration_number in range(db_version + 1, len(migrations)):
+            for statement_number, statement in enumerate(migrations[migration_number][1]):
+                # Run all the migration commands.
+                # We don't use executescript() because (on old Pythons?) it
+                # commits the current transaction first.
+                try:
+                    cur.execute(statement)
+                except sqlite3.OperationalError:
+                    logger.exception("Could not execute migration %s statement %s: %s", migration_number, statement_number, statement)
+                    raise
+            cur.execute("INSERT INTO migrations VALUES (?, ?)", (migration_number, migrations[migration_number][0]))
+
+        # If we did have to migrate, leave everything else we do as part of the migration transaction.
+
+    ##
+    # Recording Methods
+    ##
+
+    @classmethod
+    @db_retry
+    def record_workflow_creation(cls, workflow_id: str, job_store_spec: str) -> None:
+        """
+        Record that a workflow is being run.
+
+        Takes the Toil config's workflow ID and the location of the job store.
+
+        Should only be called on the *first* attempt on a job store, not on a
+        restart.
+
+        A workflow may have multiple attempts to run it, some of which succeed
+        and others of which fail. Probably only the last one should succeed.
+
+        :param job_store_spec: The job store specifier for the workflow. Should
+            be canonical and always start with the type and a colon. If the
+            job store is later moved by the user, the location will not be
+            updated.
+        """
+
+        if not cls.WORKFLOW_HISTORY_ENABLED:
+            return
+
+        logger.info("Recording workflow creation of %s in %s", workflow_id, job_store_spec)
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute("INSERT INTO workflows VALUES (?, ?, ?, NULL, NULL)", (workflow_id, job_store_spec, time.time()))
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        # If we raise out of here the connection goes away and the transaction rolls back.
+
+
+    @classmethod
+    @db_retry
+    def record_workflow_metadata(cls, workflow_id: str, workflow_name: str, trs_spec: Optional[str] = None) -> None:
+        """
+        Associate a name and optionally a TRS ID and version with a workflow run.
+        """
+
+        # TODO: Make name of this function less general?
+
+        if not cls.WORKFLOW_HISTORY_ENABLED:
+            return
+
+        logger.info("Workflow %s is a run of %s", workflow_id, workflow_name)
+        if trs_spec:
+            logger.info("Workflow %s has TRS ID and version %s", workflow_id, trs_spec)
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute("UPDATE workflows SET name = ? WHERE id = ?", (workflow_name, workflow_id))
+            if trs_spec is not None:
+                cur.execute("UPDATE workflows SET trs_spec = ? WHERE id = ?", (trs_spec, workflow_id))
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+    @classmethod
+    @db_retry
+    def record_job_attempt(
+        cls,
+        workflow_id: str,
+        workflow_attempt_number: int,
+        job_name: str,
+        succeeded: bool,
+        start_time: float,
+        runtime: float,
+        cores: Optional[float] = None,
+        cpu_seconds: Optional[float] = None,
+        memory_bytes: Optional[int] = None,
+        disk_bytes: Optional[int] = None
+    ) -> None:
+        """
+        Record that a job ran in a workflow.
+
+        Doesn't expect the provided information to uniquely identify the job
+        attempt; assigns the job attempt its own unique ID.
+
+        Thread safe.
+
+        :param job_name: A human-readable name for the job. Not expected to be
+            a job store ID or to necessarily uniquely identify the job within
+            the workflow.
+        :param start_time: Job execution start time in seconds since epoch.
+        :param runtime: Job execution duration in seconds.
+        :param cores: Number of CPU cores the job was scheduled on.
+        :param cpu_seconds: CPU core-seconds actually consumed.
+        :param memory_bytes: Peak observed job memory usage.
+        :param disk_bytes: Observed job disk usage.
+        """
+
+        if not cls.WORKFLOW_HISTORY_ENABLED or not cls.JOB_HISTORY_ENABLED:
+            return
+
+        logger.debug("Workflow %s ran job %s", workflow_id, job_name)
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                INSERT INTO job_attempts(
+                    id,
+                    workflow_id,
+                    workflow_attempt_number,
+                    job_name,
+                    succeeded,
+                    start_time,
+                    runtime,
+                    cores,
+                    cpu_seconds,
+                    memory_bytes,
+                    disk_bytes
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    str(uuid.uuid4()),
+                    workflow_id,
+                    workflow_attempt_number,
+                    job_name,
+                    1 if succeeded else 0,
+                    start_time,
+                    runtime,
+                    cores,
+                    cpu_seconds,
+                    memory_bytes,
+                    disk_bytes,
+                )
+            )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+    @classmethod
+    @db_retry
+    def record_workflow_attempt(
+        cls,
+        workflow_id: str,
+        workflow_attempt_number: int,
+        succeeded: bool,
+        start_time: float,
+        runtime: float,
+        batch_system: Optional[str] = None,
+        caching: Optional[bool] = None,
+        toil_version: Optional[str] = None,
+        python_version: Optional[str] = None,
+        platform_system: Optional[str] = None,
+        platform_machine: Optional[str] = None
+    ) -> None:
+        """
+        Record a workflow attempt (start or restart) having finished or failed.
+
+        :param batch_system: The Python type name of the batch system implementation used.
+        :param caching: Whether Toil filestore-level caching was used.
+        :param toil_version: Version of Toil used to run the workflow.
+        :param python_version: Version of Python used to run the workflow.
+        :param platform_system: OS ("Darwin", "Linux", etc.) used to run the workflow.
+        :param platform_machine: CPU type ("AMD64", etc.) used to run the workflow leader.
+        """
+
+        if not cls.WORKFLOW_HISTORY_ENABLED:
+            return
+
+        logger.info("Workflow %s stopped. Success: %s", workflow_id, succeeded)
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                INSERT INTO workflow_attempts(
+                    workflow_id,
+                    attempt_number,
+                    succeeded,
+                    start_time,
+                    runtime,
+                    batch_system,
+                    caching,
+                    toil_version,
+                    python_version,
+                    platform_system,
+                    platform_machine
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    workflow_id,
+                    workflow_attempt_number,
+                    1 if succeeded else 0,
+                    start_time,
+                    runtime,
+                    batch_system,
+                    caching,
+                    toil_version,
+                    python_version,
+                    platform_system,
+                    platform_machine
+                )
+            )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+
+    ##
+    # Read methods
+    ##
+
+    # We would implement a bunch of iterators and allow follow-up queries, but
+    # then we'd have to figure out how to make sure we use one connection and
+    # cursor and not block ourselves with the database transaction locks.
+    #
+    # So instead we always fetch all the information asked for and close out
+    # the read transaction before returning.
+    #
+    # This means the caller has to worry about a workflow vanishing or changing
+    # between when it was shown to them and when they ask follow-up questions,
+    # but it also means we can't deadlock.
+
+    @classmethod
+    @db_retry
+    def summarize_workflows(cls) -> list[WorkflowSummary]:
+        """
+        List all known workflows and their summary statistics.
+        """
+
+        workflows = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                SELECT
+                    workflows.id AS id,
+                    workflows.name AS name,
+                    workflows.job_store AS job_store,
+                    (SELECT count(*) FROM workflow_attempts WHERE workflow_id = workflows.id) AS total_attempts,
+                    (SELECT count(*) FROM job_attempts WHERE workflow_id = workflows.id) AS total_job_attempts,
+                    (SELECT min(count(*), 1) FROM workflow_attempts WHERE workflow_id = workflows.id AND succeeded = TRUE) AS succeeded,
+                    (SELECT min(start_time) FROM workflow_attempts WHERE workflow_id = workflows.id) AS start_time,
+                    (SELECT max(start_time + runtime) FROM workflow_attempts WHERE workflow_id = workflows.id) AS end_time,
+                    workflows.trs_spec AS trs_spec
+                FROM workflows
+                ORDER BY start_time DESC
+                """
+            )
+            for row in cur:
+                workflows.append(
+                    WorkflowSummary(
+                        id=row["id"],
+                        name=row["name"],
+                        job_store=row["job_store"],
+                        total_attempts=row["total_attempts"],
+                        total_job_attempts=row["total_job_attempts"],
+                        succeeded=(row["succeeded"] == 1),
+                        start_time=row["start_time"],
+                        runtime=(row["end_time"] - row["start_time"]) if row["start_time"] is not None and row["end_time"] is not None else None,
+                        trs_spec=row["trs_spec"]
+                    )
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return workflows
+
+    @classmethod
+    @db_retry
+    def get_submittable_workflow_attempts(cls, limit: int = sys.maxsize) -> list[WorkflowAttemptSummary]:
+        """
+        List all workflow attempts not yet submitted to Dockstore.
+
+        :param limit: Get no more than this many.
+        """
+
+        attempts = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                SELECT
+                    workflow_attempts.workflow_id AS workflow_id,
+                    workflow_attempts.attempt_number AS attempt_number,
+                    workflow_attempts.succeeded AS succeeded,
+                    workflow_attempts.start_time AS start_time,
+                    workflow_attempts.runtime AS runtime,
+                    workflow_attempts.batch_system AS batch_system,
+                    workflow_attempts.caching AS caching,
+                    workflow_attempts.toil_version AS toil_version,
+                    workflow_attempts.python_version AS python_version,
+                    workflow_attempts.platform_system AS platform_system,
+                    workflow_attempts.platform_machine AS platform_machine,
+                    workflow_attempts.submitted_to_dockstore AS submitted_to_dockstore,
+                    workflows.job_store AS workflow_job_store,
+                    workflows.trs_spec AS workflow_trs_spec
+                FROM workflow_attempts
+                JOIN workflows ON workflow_attempts.workflow_id = workflows.id
+                WHERE workflow_attempts.submitted_to_dockstore = FALSE
+                AND workflows.trs_spec IS NOT NULL
+                ORDER BY start_time DESC
+                LIMIT ?
+                """,
+                (limit,)
+            )
+            for row in cur:
+                attempts.append(
+                    WorkflowAttemptSummary(
+                        workflow_id=row["workflow_id"],
+                        attempt_number=row["attempt_number"],
+                        succeeded=(row["succeeded"] == 1),
+                        start_time=row["start_time"],
+                        runtime=row["runtime"],
+                        batch_system=row["batch_system"],
+                        caching=(row["caching"] == 1),
+                        toil_version=row["toil_version"],
+                        python_version=row["python_version"],
+                        platform_system=row["platform_system"],
+                        platform_machine=row["platform_machine"],
+                        submitted_to_dockstore=(row["submitted_to_dockstore"] == 1),
+                        workflow_job_store=row["workflow_job_store"],
+                        workflow_trs_spec=row["workflow_trs_spec"]
+                    )
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return attempts
+
+    @classmethod
+    @db_retry
+    def get_workflow_attempts_with_submittable_job_attempts(cls, limit: int = sys.maxsize) -> list[WorkflowAttemptSummary]:
+        """
+        Get all workflow attempts that have job attempts not yet submitted to
+        Dockstore.
+
+        The workflow attempts themselves will have finished and been recorded,
+        and have TRS IDs.
+
+        :param limit: Get no more than this many.
+        """
+
+        attempts = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                SELECT
+                    workflow_attempts.workflow_id AS workflow_id,
+                    workflow_attempts.attempt_number AS attempt_number,
+                    workflow_attempts.succeeded AS succeeded,
+                    workflow_attempts.start_time AS start_time,
+                    workflow_attempts.runtime AS runtime,
+                    workflow_attempts.batch_system AS batch_system,
+                    workflow_attempts.caching AS caching,
+                    workflow_attempts.toil_version AS toil_version,
+                    workflow_attempts.python_version AS python_version,
+                    workflow_attempts.platform_system AS platform_system,
+                    workflow_attempts.platform_machine AS platform_machine,
+                    workflow_attempts.submitted_to_dockstore AS submitted_to_dockstore,
+                    workflows.job_store AS workflow_job_store,
+                    workflows.trs_spec AS workflow_trs_spec
+                FROM (
+                    SELECT DISTINCT
+                        workflow_id, workflow_attempt_number
+                    FROM job_attempts
+                    WHERE job_attempts.submitted_to_dockstore = FALSE
+                ) AS found_job_attempts
+                JOIN workflows ON found_job_attempts.workflow_id = workflows.id
+                JOIN workflow_attempts ON
+                    found_job_attempts.workflow_id = workflow_attempts.workflow_id
+                    AND found_job_attempts.workflow_attempt_number = workflow_attempts.attempt_number
+                WHERE workflows.trs_spec IS NOT NULL
+                LIMIT ?
+                """,
+                (limit,)
+            )
+            for row in cur:
+                # TODO: Unify row to data class conversion
+                attempts.append(
+                    WorkflowAttemptSummary(
+                        workflow_id=row["workflow_id"],
+                        attempt_number=row["attempt_number"],
+                        succeeded=(row["succeeded"] == 1),
+                        start_time=row["start_time"],
+                        runtime=row["runtime"],
+                        batch_system=row["batch_system"],
+                        caching=(row["caching"] == 1),
+                        toil_version=row["toil_version"],
+                        python_version=row["python_version"],
+                        platform_system=row["platform_system"],
+                        platform_machine=row["platform_machine"],
+                        submitted_to_dockstore=(row["submitted_to_dockstore"] == 1),
+                        workflow_job_store=row["workflow_job_store"],
+                        workflow_trs_spec=row["workflow_trs_spec"]
+                    )
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return attempts
+
+    @classmethod
+    @db_retry
+    def get_workflow_attempt(cls, workflow_id: str, attempt_number: int) -> Optional[WorkflowAttemptSummary]:
+        """
+        Get a single (not necessarily unsubmitted, not necessarily TRS-ID-having) workflow attempt summary, if present.
+        """
+
+        # TODO: Consolidate with the other 2 ways to query workflow attempts!
+
+        attempts = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                SELECT
+                    workflow_attempts.workflow_id AS workflow_id,
+                    workflow_attempts.attempt_number AS attempt_number,
+                    workflow_attempts.succeeded AS succeeded,
+                    workflow_attempts.start_time AS start_time,
+                    workflow_attempts.runtime AS runtime,
+                    workflow_attempts.batch_system AS batch_system,
+                    workflow_attempts.caching AS caching,
+                    workflow_attempts.toil_version AS toil_version,
+                    workflow_attempts.python_version AS python_version,
+                    workflow_attempts.platform_system AS platform_system,
+                    workflow_attempts.platform_machine AS platform_machine,
+                    workflow_attempts.submitted_to_dockstore AS submitted_to_dockstore,
+                    workflows.job_store AS workflow_job_store,
+                    workflows.trs_spec AS workflow_trs_spec
+                FROM workflow_attempts
+                JOIN workflows ON workflow_attempts.workflow_id = workflows.id
+                WHERE workflow_id = ?
+                AND attempt_number = ?
+                ORDER BY start_time DESC
+                LIMIT 1
+                """,
+                (workflow_id, attempt_number)
+            )
+            for row in cur:
+                attempts.append(
+                    WorkflowAttemptSummary(
+                        workflow_id=row["workflow_id"],
+                        attempt_number=row["attempt_number"],
+                        succeeded=(row["succeeded"] == 1),
+                        start_time=row["start_time"],
+                        runtime=row["runtime"],
+                        batch_system=row["batch_system"],
+                        caching=(row["caching"] == 1),
+                        toil_version=row["toil_version"],
+                        python_version=row["python_version"],
+                        platform_system=row["platform_system"],
+                        platform_machine=row["platform_machine"],
+                        submitted_to_dockstore=(row["submitted_to_dockstore"] == 1),
+                        workflow_job_store=row["workflow_job_store"],
+                        workflow_trs_spec=row["workflow_trs_spec"]
+                    )
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        if len(attempts) == 0:
+            # Not found
+            return None
+        else:
+            return attempts[0]
+
+    @classmethod
+    @db_retry
+    def get_unsubmitted_job_attempts(cls, workflow_id: str, attempt_number: int) -> list[JobAttemptSummary]:
+        """
+        List all job attempts in the given workflow attempt not yet submitted to Dockstore.
+
+        Doesn't check to make sure the workflow has a TRS ID.
+        """
+
+        attempts = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                """
+                SELECT
+                    id,
+                    job_name,
+                    succeeded,
+                    start_time,
+                    runtime,
+                    cores,
+                    cpu_seconds,
+                    memory_bytes,
+                    disk_bytes,
+                    submitted_to_dockstore
+                FROM job_attempts
+                WHERE workflow_id = ?
+                AND workflow_attempt_number = ?
+                AND submitted_to_dockstore = FALSE
+                ORDER BY start_time DESC
+                """,
+                (workflow_id, attempt_number)
+            )
+            for row in cur:
+                attempts.append(
+                    JobAttemptSummary(
+                        id=row["id"],
+                        job_name=row["job_name"],
+                        succeeded=(row["succeeded"] == 1),
+                        start_time=row["start_time"],
+                        runtime=row["runtime"],
+                        cores=row["cores"],
+                        cpu_seconds=row["cpu_seconds"],
+                        memory_bytes=row["memory_bytes"],
+                        disk_bytes=row["disk_bytes"],
+                        submitted_to_dockstore=(row["submitted_to_dockstore"] == 1)
+                    )
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return attempts
+
+    ###
+    # Submission marking methods
+    ###
+
+    @classmethod
+    @db_retry
+    def mark_workflow_attempt_submitted(cls, workflow_id: str, attempt_number: int) -> None:
+        """
+        Mark a workflow attempt as having been successfully submitted to Dockstore.
+
+        Does not mark the workflow attempt's job attempts as submitted.
+        """
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            cur.execute(
+                "UPDATE workflow_attempts SET submitted_to_dockstore = TRUE WHERE workflow_id = ? AND attempt_number = ?",
+                (workflow_id, attempt_number)
+            )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+    @classmethod
+    @db_retry
+    def mark_job_attempts_submitted(cls, job_attempt_ids: list[str]) -> None:
+        """
+        Mark a collection of job attempts as submitted to Dockstore in a single transaction.
+        """
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+            for job_attempt_id in job_attempt_ids:
+                # Do all the marking in one transaction
+                cur.execute(
+                    "UPDATE job_attempts SET submitted_to_dockstore = TRUE WHERE id = ?",
+                    (job_attempt_id,)
+                )
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+    @classmethod
+    @db_retry
+    def count_workflows(cls) -> int:
+        """
+        Count workflows in the database.
+        """
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute("SELECT count(*) FROM workflows")
+
+            count = cur.fetchone()[0]
+            assert isinstance(count, int)
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return count
+
+    @classmethod
+    @db_retry
+    def count_workflow_attempts(cls) -> int:
+        """
+        Count workflow attempts in the database.
+        """
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute("SELECT count(*) FROM workflow_attempts")
+
+            count = cur.fetchone()[0]
+            assert isinstance(count, int)
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return count
+
+    @classmethod
+    @db_retry
+    def count_job_attempts(cls) -> int:
+        """
+        Count job attempts in the database.
+        """
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute("SELECT count(*) FROM job_attempts")
+
+            count = cur.fetchone()[0]
+            assert isinstance(count, int)
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return count
+
+    @classmethod
+    @db_retry
+    def get_fully_submitted_workflow_ids(cls, limit: int = sys.maxsize) -> list[str]:
+        """
+        Get workflows that have a successful attempt and no unsubmitted attempts or job attempts.
+        """
+        ids = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute(
+                """
+                SELECT
+                    workflows.id
+                FROM workflows
+                WHERE
+                    (
+                        SELECT
+                            count(*)
+                        FROM workflow_attempts
+                        WHERE workflow_id = workflows.id
+                        AND succeeded = TRUE
+                        AND submitted_to_dockstore = TRUE
+                        LIMIT 1
+                    ) = 1
+                    AND (
+                        SELECT
+                            count(*)
+                        FROM workflow_attempts
+                        WHERE workflow_id = workflows.id
+                        AND submitted_to_dockstore = FALSE
+                        LIMIT 1
+                    ) = 0
+                    AND (
+                        SELECT
+                            count(*)
+                        FROM job_attempts
+                        WHERE workflow_id = workflows.id
+                        AND submitted_to_dockstore = FALSE
+                        LIMIT 1
+                    ) = 0
+                LIMIT ?
+                """,
+                (limit,)
+            )
+            for row in cur:
+                ids.append(row["id"])
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return ids
+
+    @classmethod
+    @db_retry
+    def get_oldest_workflow_ids(cls, limit: int = sys.maxsize) -> list[str]:
+        """
+        Get workflows that are old.
+        """
+
+        ids = []
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            # We could use a complicated query to bump workflows down the list
+            # if they have been updated by having attempts or job attempts. But
+            # that would mean we'd need to do a lot of querying and live
+            # sorting, whereas using just the creation time lets us use an
+            # index and a limit efficiently.
+
+            cur.execute(
+                """
+                SELECT
+                    id,
+                    creation_time
+                FROM workflows
+                ORDER BY creation_time ASC
+                LIMIT ?
+                """,
+                (limit,)
+            )
+            for row in cur:
+                ids.append(row["id"])
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return ids
+
+    @classmethod
+    @db_retry
+    def delete_workflow(cls, workflow_id: str) -> None:
+        """
+        Delete a workflow and all its attempts and job attempts.
+
+        Succeeds if the workflow does not exist.
+        """
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute("DELETE FROM job_attempts WHERE workflow_id = ?", (workflow_id,))
+            cur.execute("DELETE FROM workflow_attempts WHERE workflow_id = ?", (workflow_id,))
+            cur.execute("DELETE FROM workflows WHERE id = ?", (workflow_id,))
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+    @classmethod
+    @db_retry
+    def get_database_byte_size(cls) -> int:
+        """
+        Get the total number of bytes used by the database.
+        """
+
+        con = cls.connection()
+        cur = con.cursor()
+        try:
+            cls.ensure_tables(con, cur)
+
+            cur.execute("PRAGMA page_size")
+            page_size = cur.fetchone()[0]
+            assert isinstance(page_size, int)
+
+            cur.execute("PRAGMA page_count")
+            page_count = cur.fetchone()[0]
+            assert isinstance(page_count, int)
+
+        except:
+            con.rollback()
+            con.close()
+            raise
+        else:
+            con.commit()
+            con.close()
+
+        return page_size * page_count
+
+    @classmethod
+    @db_retry
+    def compact_database(cls) -> None:
+        """
+        Shrink the database to remove unused space.
+        """
+
+        con = cls.connection()
+        cur = con.cursor()
+
+        # Don't bother making tables; we don't need them for this and they need
+        # a transaction.
+
+        with cls.no_transaction(con):
+            # Do the vacuum outside any transaction, and rely on it to
+            # synchronize appropriately internally.
+            cur.execute("VACUUM")
+
+        con.close()
+
+    @classmethod
+    def enforce_byte_size_limit(cls, limit: int = 100 * 1024 * 1024) -> None:
+        """
+        Shrink the database until it is smaller than the given limit, or until
+        it is empty, by throwing away workflows.
+
+        Throws data away in a sensible order, least important to most
+        important.
+        """
+
+        db_size = cls.get_database_byte_size()
+
+        if db_size < limit:
+            # Nothing to do!
+            return
+
+        while db_size > limit:
+            # Look for some things we submitted already
+            target_workflows = cls.get_fully_submitted_workflow_ids(limit=100)
+            if len(target_workflows) == 0:
+                # If there aren't any, do oldest workflows a few at a time
+                # We need to balance the O(n^2)
+                # delete-and-copy-the-whole-db-to-vacuum loop with not wanting
+                # to delete too many workflows we could keep.
+                target_workflows = cls.get_oldest_workflow_ids(limit=10)
+            if len(target_workflows) == 0:
+                # There are no more workflows to delete.
+                break
+
+            for workflow_id in target_workflows:
+                # Delete all the workflows we don't want.
+                cls.delete_workflow(workflow_id)
+
+            # Shrink the DB
+            cls.compact_database()
+            # Re-check the size
+            db_size = cls.get_database_byte_size()
+
+
+
+
+    @classmethod
+    def database_dump_lines(cls) -> Iterable[str]:
+        """
+        Yield lines from the database dump.
+
+        For debugging tests.
+        """
+        return cls.connection().iterdump()
+
+
+
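For orientation, here is a minimal, hypothetical sketch of how the HistoryManager API added above might be driven. It is based only on the signatures and docstrings in this diff; the database path, job store spec, workflow name, and TRS spec below are made-up placeholder values, not anything Toil prescribes.

    import time
    import uuid

    from toil.lib.history import HistoryManager

    # Point the class at a scratch database, as the database_path_override
    # comment in the diff suggests tests can do. (Hypothetical path.)
    HistoryManager.database_path_override = "/tmp/history-example.sqlite"

    # Record a workflow run, its metadata, and one finished attempt.
    workflow_id = str(uuid.uuid4())
    HistoryManager.record_workflow_creation(workflow_id, "file:/tmp/example-jobstore")
    HistoryManager.record_workflow_metadata(workflow_id, "example.cwl", trs_spec="#workflow/example:main")
    HistoryManager.record_workflow_attempt(
        workflow_id, 1, succeeded=True, start_time=time.time() - 42.0, runtime=42.0
    )

    # Read back summary statistics for all recorded workflows.
    for summary in HistoryManager.summarize_workflows():
        print(summary.id, summary.name, summary.succeeded, summary.runtime)

Note that per the diff, job-level recording stays off in this release (JOB_HISTORY_ENABLED is False), so record_job_attempt calls are no-ops in 8.1.0b1.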