toil 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/slurm.py +191 -16
  6. toil/cwl/cwltoil.py +17 -82
  7. toil/fileStores/__init__.py +1 -1
  8. toil/fileStores/abstractFileStore.py +5 -2
  9. toil/fileStores/cachingFileStore.py +1 -1
  10. toil/job.py +30 -14
  11. toil/jobStores/abstractJobStore.py +24 -19
  12. toil/jobStores/aws/jobStore.py +862 -1963
  13. toil/jobStores/aws/utils.py +24 -270
  14. toil/jobStores/googleJobStore.py +25 -9
  15. toil/jobStores/utils.py +0 -327
  16. toil/leader.py +27 -22
  17. toil/lib/aws/config.py +22 -0
  18. toil/lib/aws/s3.py +477 -9
  19. toil/lib/aws/utils.py +22 -33
  20. toil/lib/checksum.py +88 -0
  21. toil/lib/conversions.py +33 -31
  22. toil/lib/directory.py +217 -0
  23. toil/lib/ec2.py +97 -29
  24. toil/lib/exceptions.py +2 -1
  25. toil/lib/expando.py +2 -2
  26. toil/lib/generatedEC2Lists.py +73 -16
  27. toil/lib/io.py +33 -2
  28. toil/lib/memoize.py +21 -7
  29. toil/lib/pipes.py +385 -0
  30. toil/lib/retry.py +1 -1
  31. toil/lib/threading.py +1 -1
  32. toil/lib/web.py +4 -5
  33. toil/provisioners/__init__.py +5 -2
  34. toil/provisioners/aws/__init__.py +43 -36
  35. toil/provisioners/aws/awsProvisioner.py +22 -13
  36. toil/provisioners/node.py +60 -12
  37. toil/resource.py +3 -13
  38. toil/test/__init__.py +14 -16
  39. toil/test/batchSystems/test_slurm.py +103 -14
  40. toil/test/cwl/staging_cat.cwl +27 -0
  41. toil/test/cwl/staging_make_file.cwl +25 -0
  42. toil/test/cwl/staging_workflow.cwl +43 -0
  43. toil/test/cwl/zero_default.cwl +61 -0
  44. toil/test/docs/scripts/tutorial_staging.py +17 -8
  45. toil/test/jobStores/jobStoreTest.py +23 -133
  46. toil/test/lib/aws/test_iam.py +7 -7
  47. toil/test/lib/aws/test_s3.py +30 -33
  48. toil/test/lib/aws/test_utils.py +9 -9
  49. toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
  50. toil/test/src/autoDeploymentTest.py +2 -3
  51. toil/test/src/fileStoreTest.py +89 -87
  52. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  53. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  54. toil/test/utils/toilKillTest.py +35 -28
  55. toil/test/wdl/md5sum/md5sum.json +1 -1
  56. toil/test/wdl/testfiles/gather.wdl +52 -0
  57. toil/test/wdl/wdltoil_test.py +120 -38
  58. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  59. toil/utils/toilDebugFile.py +6 -3
  60. toil/utils/toilStats.py +17 -2
  61. toil/version.py +6 -6
  62. toil/wdl/wdltoil.py +1038 -549
  63. toil/worker.py +5 -2
  64. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/METADATA +12 -12
  65. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/RECORD +69 -61
  66. toil/lib/iterables.py +0 -112
  67. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  68. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/WHEEL +0 -0
  69. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/entry_points.txt +0 -0
  70. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/licenses/LICENSE +0 -0
  71. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/top_level.txt +0 -0
toil/batchSystems/abstractBatchSystem.py CHANGED
@@ -11,6 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ from dataclasses import dataclass
  import enum
  import logging
  import os
@@ -72,10 +73,13 @@ class BatchJobExitReason(enum.IntEnum):
  except ValueError:
  return str(value)

-
- class UpdatedBatchJobInfo(NamedTuple):
+ @dataclass
+ class UpdatedBatchJobInfo:
  jobID: int
- exitStatus: int
+ """
+ The Toil batch system ID of the job.
+ """
+ exitStatus: int = EXIT_STATUS_UNAVAILABLE_VALUE
  """
  The exit status (integer value) of the job. 0 implies successful.

@@ -83,8 +87,12 @@ class UpdatedBatchJobInfo(NamedTuple):
  (e.g. job is lost, or otherwise died but actual exit code was not reported).
  """

- exitReason: Optional[BatchJobExitReason]
- wallTime: Union[float, int, None]
+ exitReason: Optional[BatchJobExitReason] = None
+ wallTime: Union[float, int, None] = None
+ backing_id: Optional[str] = None
+ """
+ The identifier for the job in the backing scheduler, if available.
+ """


  # Information required for worker cleanup on shutdown of the batch system.
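Note: UpdatedBatchJobInfo changes from a NamedTuple to a dataclass with defaults and gains a backing_id field. A minimal illustrative sketch of how callers can now construct it, assuming it remains importable from toil.batchSystems.abstractBatchSystem as the hunk above implies:

    # Illustrative only: with the dataclass defaults, callers pass only what they know.
    from toil.batchSystems.abstractBatchSystem import UpdatedBatchJobInfo

    # Only the Toil batch system job ID is required; exitStatus defaults to
    # EXIT_STATUS_UNAVAILABLE_VALUE and exitReason/wallTime/backing_id to None.
    info = UpdatedBatchJobInfo(jobID=42)

    # The new backing_id field can carry the backing scheduler's own job identifier.
    info = UpdatedBatchJobInfo(jobID=42, exitStatus=0, backing_id="12345")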
toil/batchSystems/abstractGridEngineBatchSystem.py CHANGED
@@ -159,14 +159,21 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  logger.debug("Running %r", subLine)
  batchJobID = self.boss.with_retries(self.submitJob, subLine)
  if self.boss._outbox is not None:
- # JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
+ # JobID corresponds to the toil version of the jobID,
+ # different from the jobstore's idea of the id. batchjobid
+ # is what we get from e.g. slurm
  self.boss._outbox.publish(
  ExternalBatchIdMessage(
  jobID, batchJobID, self.boss.__class__.__name__
  )
  )

- logger.debug("Submitted job %s", str(batchJobID))
+ logger.info(
+ "Job %s with batch system ID %s queued as job %s",
+ jobName,
+ jobID,
+ str(batchJobID)
+ )

  # Store dict for mapping Toil job ID to batch job ID
  # TODO: Note that this currently stores a tuple of (batch system
@@ -251,8 +258,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  self.coalesce_job_exit_codes, batch_job_id_list
  )
  # We got the statuses as a batch
- for running_job_id, status in zip(running_job_list, statuses):
- activity = self._handle_job_status(running_job_id, status, activity)
+ for running_job_id, status, backing_id in zip(running_job_list, statuses, batch_job_id_list):
+ activity = self._handle_job_status(running_job_id, status, activity, backing_id)

  self._checkOnJobsCache = activity
  self._checkOnJobsTimestamp = datetime.now()
@@ -263,6 +270,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  job_id: int,
  status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
  activity: bool,
+ backing_id: str,
  ) -> bool:
  """
  Helper method for checkOnJobs to handle job statuses
@@ -275,7 +283,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  code, reason = status
  self.updatedJobsQueue.put(
  UpdatedBatchJobInfo(
- jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
+ jobID=job_id,
+ exitStatus=code,
+ exitReason=reason,
+ wallTime=None,
+ backing_id=backing_id,
  )
  )
  self.forgetJob(job_id)
toil/batchSystems/kubernetes.py CHANGED
@@ -37,6 +37,7 @@ from threading import Condition, Event, RLock, Thread
  from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast, overload

  from toil.lib.conversions import opt_strtobool
+ from toil.lib.throttle import LocalThrottle

  if sys.version_info < (3, 10):
  from typing_extensions import ParamSpec
@@ -281,6 +282,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
  # in the queue or any resource becomes available.
  self._work_available: Condition = Condition(lock=self._mutex)

+ # To make sure we don't spam the log when the metrics server is down,
+ # we use a throttle
+ self._metrics_throttle: LocalThrottle = LocalThrottle(600)
+
  self.schedulingThread: Thread = Thread(target=self._scheduler, daemon=True)
  self.schedulingThread.start()

@@ -1363,7 +1368,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
  # This is the sort of error we would expect from an overloaded
  # Kubernetes or a dead metrics service.
  # We can't tell that the pod is stuck, so say that it isn't.
- logger.warning("Could not query metrics service: %s", e)
+ if self._metrics_throttle.throttle(False):
+ logger.warning("Kubernetes metrics service is not available: %s", e)
  return False
  else:
  raise
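Note: the LocalThrottle(600) guard above limits the "metrics service is not available" warning to roughly once per 600-second window instead of once per poll. A standalone sketch of the same rate-limiting idea, not LocalThrottle itself (its exact semantics, including the False argument meaning "do not block", live in toil.lib.throttle and are assumed here):

    import time

    class SimpleLogThrottle:
        """Allow an action at most once per `interval` seconds (illustrative only)."""

        def __init__(self, interval: float) -> None:
            self.interval = interval
            self.last_allowed = float("-inf")

        def throttle(self) -> bool:
            """Return True if the caller may act now, False if still inside the window."""
            now = time.monotonic()
            if now - self.last_allowed >= self.interval:
                self.last_allowed = now
                return True
            return False

    throttle = SimpleLogThrottle(600)
    if throttle.throttle():
        print("metrics service unavailable")  # would be logger.warning(...) in Toil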
@@ -1602,6 +1608,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
  exitStatus=exitCode,
  wallTime=runtime,
  exitReason=exitReason,
+ backing_id=jobObject.metadata.name,
  )

  if (exitReason == BatchJobExitReason.FAILED) or (
@@ -1855,7 +1862,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):

  # Return the one finished job we found
  return UpdatedBatchJobInfo(
- jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None
+ jobID=jobID,
+ exitStatus=exitCode,
+ wallTime=runtime,
+ exitReason=None,
+ backing_id=jobObject.metadata.name,
  )

  def _waitForJobDeath(self, jobName: str) -> None:
toil/batchSystems/mesos/batchSystem.py CHANGED
@@ -103,6 +103,9 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
  if config.mesos_framework_id is not None:
  self.mesos_framework_id = config.mesos_framework_id

+ # How long in seconds to wait to register before declaring Mesos unreachable.
+ self.mesos_timeout = 60
+
  # Written to when Mesos kills tasks, as directed by Toil.
  # Jobs must not enter this set until they are removed from runningJobMap.
  self.killedJobIds = set()
@@ -345,17 +348,38 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
  framework.roles = config.mesos_role
  framework.capabilities = [dict(type="MULTI_ROLE")]

+ endpoint = self._resolveAddress(self.mesos_endpoint)
+ log.info("Connecting to Mesos at %s...", self.mesos_endpoint)
+
  # Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts.
  # Make sure it will call us with nice namespace-y addicts
  self.driver = MesosSchedulerDriver(
  self,
  framework,
- self._resolveAddress(self.mesos_endpoint),
+ endpoint,
  use_addict=True,
  implicit_acknowledgements=True,
  )
  self.driver.start()

+ wait_count = 0
+ while self.frameworkId is None:
+ # Wait to register with Mesos, and eventually fail if it just isn't
+ # responding.
+
+ # TODO: Use a condition instead of a spin wait.
+
+ if wait_count >= self.mesos_timeout:
+ error_message = f"Could not connect to Mesos endpoint at {self.mesos_endpoint}"
+ log.error(error_message)
+ self.shutdown()
+ raise RuntimeError(error_message)
+ elif wait_count > 1 and wait_count % 10 == 0:
+ log.warning("Waiting for Mesos registration (try %s/%s)", wait_count, self.mesos_timeout)
+ time.sleep(1)
+ wait_count += 1
+
+
  @staticmethod
  def _resolveAddress(address):
  """
@@ -394,10 +418,17 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
  """
  Invoked when the scheduler successfully registers with a Mesos master
  """
- log.debug("Registered with framework ID %s", frameworkId.value)
+ log.info("Registered with Mesos as framework ID %s", frameworkId.value)
  # Save the framework ID
  self.frameworkId = frameworkId.value

+ def error(self, driver, message):
+ """
+ Invoked when Mesos reports an unrecoverable error.
+ """
+ log.error("Mesos error: %s", message)
+ super().error(driver, message)
+
  def _declineAllOffers(self, driver, offers):
  for offer in offers:
  driver.declineOffer(offer.id)
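Note: the new registration wait above polls frameworkId once per second for up to mesos_timeout seconds. The TODO in that hunk suggests replacing the spin wait with a condition; a hypothetical sketch of that alternative (the callback name and timer-based simulation below are assumptions for illustration, not Toil code):

    import threading

    # Hypothetical sketch of an Event-based wait instead of polling frameworkId.
    registered = threading.Event()

    def registered_callback(framework_id: str) -> None:
        # Would be called from the driver thread when Mesos accepts the framework.
        registered.set()

    # Simulate the driver registering after a moment so the sketch completes.
    threading.Timer(0.1, registered_callback, args=["fake-framework-id"]).start()

    # Main thread: block until registration or timeout, with no spin wait.
    if not registered.wait(timeout=60):
        raise RuntimeError("Could not connect to Mesos endpoint")
    print("registered")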
toil/batchSystems/slurm.py CHANGED
@@ -18,9 +18,11 @@ import logging
  import math
  import os
  import sys
- from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
  import shlex
- from typing import Callable, NamedTuple, TypeVar
+
+ from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
+ from datetime import datetime, timedelta, timezone
+ from typing import Callable, NamedTuple, Optional, TypeVar

  from toil.batchSystems.abstractBatchSystem import (
  EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -350,9 +352,18 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
  """
  Collect all job exit codes in a single call.
- :param batch_job_id_list: list of Job ID strings, where each string has the form
- "<job>[.<task>]".
- :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
+
+ :param batch_job_id_list: list of Job ID strings, where each string
+ has the form ``<job>[.<task>]``.
+
+ :return: list of job exit codes or exit code, exit reason pairs
+ associated with the list of job IDs.
+
+ :raises CalledProcessErrorStderr: if communicating with Slurm went
+ wrong.
+
+ :raises OSError: if job details are not available because a Slurm
+ command could not start.
  """
  logger.log(
  TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
@@ -387,15 +398,54 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
  Fetch job details from Slurm's accounting system or job control system.
  :param job_id_list: list of integer Job IDs.
- :return: dict of job statuses, where key is the integer job ID, and value is a tuple
- containing the job's state and exit code.
+ :return: dict of job statuses, where key is the integer job ID, and
+ value is a tuple containing the job's state and exit code.
+ :raises CalledProcessErrorStderr: if communicating with Slurm went
+ wrong.
+ :raises OSError: if job details are not available because a Slurm
+ command could not start.
  """
+
+ status_dict = {}
+ scontrol_problem: Optional[Exception] = None
+
+ try:
+ # Get all the job details we can from scontrol, which we think
+ # might be faster/less dangerous than sacct searching, even
+ # though it can't be aimed at more than one job.
+ status_dict.update(self._getJobDetailsFromScontrol(job_id_list))
+ except (CalledProcessErrorStderr, OSError) as e:
+ if isinstance(e, OSError):
+ logger.warning("Could not run scontrol: %s", e)
+ else:
+ logger.warning("Error from scontrol: %s", e)
+ scontrol_problem = e
+
+ logger.debug("After scontrol, got statuses: %s", status_dict)
+
+ # See what's not handy in scontrol (or everything if we couldn't
+ # call it).
+ sacct_job_id_list = self._remaining_jobs(job_id_list, status_dict)
+
+ logger.debug("Remaining jobs to find out about: %s", sacct_job_id_list)
+
  try:
- status_dict = self._getJobDetailsFromSacct(job_id_list)
+ # Ask sacct about those jobs
+ status_dict.update(self._getJobDetailsFromSacct(sacct_job_id_list))
  except (CalledProcessErrorStderr, OSError) as e:
  if isinstance(e, OSError):
  logger.warning("Could not run sacct: %s", e)
- status_dict = self._getJobDetailsFromScontrol(job_id_list)
+ else:
+ logger.warning("Error from sacct: %s", e)
+ if scontrol_problem is not None:
+ # Neither approach worked at all
+ raise
+
+ # One of the methods worked, so we have at least (None, None)
+ # values filled in for all jobs.
+ assert len(status_dict) == len(job_id_list)
+
+
  return status_dict

  def _get_job_return_code(
@@ -466,15 +516,123 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):

  return state_token

+ def _remaining_jobs(self, job_id_list: list[int], job_details: dict[int, tuple[str | None, int | None]]) -> list[int]:
+ """
+ Given a list of job IDs and a list of job details (state and exit
+ code), get the list of job IDs where the details are (None, None)
+ (or are missing).
+ """
+ return [
+ j
+ for j in job_id_list
+ if job_details.get(j, (None, None)) == (None, None)
+ ]
+
  def _getJobDetailsFromSacct(
- self, job_id_list: list[int]
+ self,
+ job_id_list: list[int],
+ ) -> dict[int, tuple[str | None, int | None]]:
+ """
+ Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
+
+ Handles querying manageable time periods until all jobs have information.
+
+ There is no guarantee of inter-job consistency: one job may really
+ finish after another, but we might see the earlier-finishing job
+ still running and the later-finishing job finished.
+
+ :param job_id_list: list of integer batch job IDs.
+ :return: dict of job statuses, where key is the job-id, and value
+ is a tuple containing the job's state and exit code. Jobs with
+ no information reported from Slurm will have (None, None).
+ """
+
+ # Pick a now
+ now = datetime.now().astimezone(None)
+ # Decide when to start the search (first copy of past midnight)
+ begin_time = now.replace(
+ hour=0,
+ minute=0,
+ second=0,
+ microsecond=0,
+ fold=0
+ )
+ # And when to end (a day after that)
+ end_time = begin_time + timedelta(days=1)
+ while end_time < now:
+ # If something goes really weird, advance up to our chosen now
+ end_time += timedelta(days=1)
+ # If we don't go around the loop at least once, we might end up
+ # with an empty dict being returned, which shouldn't happen. We
+ # need the (None, None) entries for jobs we can't find.
+ assert end_time >= self.boss.start_time
+
+ results: dict[int, tuple[str | None, int | None]] = {}
+
+ while len(job_id_list) > 0 and end_time >= self.boss.start_time:
+ # There are still jobs to look for and our search isn't
+ # exclusively for stuff that only existed before our workflow
+ # started.
+ results.update(
+ self._get_job_details_from_sacct_for_range(
+ job_id_list,
+ begin_time,
+ end_time
+ )
+ )
+ job_id_list = self._remaining_jobs(job_id_list, results)
+ # If we have to search again, search the previous day. But
+ # overlap a tiny bit so the endpoints don't exactly match, in
+ # case Slurm is not working with inclusive intervals.
+ # TODO: is Slurm working with inclusive intervals?
+ end_time = begin_time + timedelta(seconds=1)
+ begin_time = end_time - timedelta(days=1, seconds=1)
+
+
+ if end_time < self.boss.start_time and len(job_id_list) > 0:
+ # This is suspicious.
+ logger.warning(
+ "Could not find any information from sacct after "
+ "workflow start at %s about jobs: %s",
+ self.boss.start_time.isoformat(),
+ job_id_list
+ )
+
+ return results
+
+ def _get_job_details_from_sacct_for_range(
+ self,
+ job_id_list: list[int],
+ begin_time: datetime,
+ end_time: datetime,
  ) -> dict[int, tuple[str | None, int | None]]:
  """
  Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
+
+ Internally, Slurm's accounting thinks in wall clock time, so for
+ efficiency you need to only search relevant real-time periods.
+
  :param job_id_list: list of integer batch job IDs.
- :return: dict of job statuses, where key is the job-id, and value is a tuple
- containing the job's state and exit code.
+ :param begin_time: An aware datetime of the earliest time to search
+ :param end_time: An aware datetime of the latest time to search
+ :return: dict of job statuses, where key is the job-id, and value
+ is a tuple containing the job's state and exit code. Jobs with
+ no information reported from Slurm will have (None, None).
  """
+
+ assert begin_time.tzinfo is not None, "begin_time must be aware"
+ assert end_time.tzinfo is not None, "end_time must be aware"
+ def stringify(t: datetime) -> str:
+ """
+ Convert an aware time to local time, and format it *without* a
+ trailing time zone indicator.
+ """
+ # TODO: What happens when we get an aware time that's ambiguous
+ # in local time? Or when the local timezone changes while we're
+ # sending things to Slurm or doing a progressive search back?
+ naive_t = t.astimezone(None).replace(tzinfo=None)
+ return naive_t.isoformat(timespec="seconds")
+
  job_ids = ",".join(str(id) for id in job_id_list)
  args = [
  "sacct",
@@ -485,8 +643,10 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  "JobIDRaw,State,ExitCode", # specify output columns
  "-P", # separate columns with pipes
  "-S",
- "1970-01-01",
- ] # override start time limit
+ stringify(begin_time),
+ "-E",
+ stringify(end_time),
+ ]

  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
  # job state and exit status. Initialize dict before processing output of `sacct`.
@@ -500,8 +660,20 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  if len(job_id_list) == 1:
  # 1 is too big, we can't recurse further, bail out
  raise
- job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
- job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
+ job_statuses.update(
+ self._get_job_details_from_sacct_for_range(
+ job_id_list[:len(job_id_list)//2],
+ begin_time,
+ end_time,
+ )
+ )
+ job_statuses.update(
+ self._get_job_details_from_sacct_for_range(
+ job_id_list[len(job_id_list)//2:],
+ begin_time,
+ end_time,
+ )
+ )
  return job_statuses
  else:
  raise
@@ -847,6 +1019,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  ) -> None:
  super().__init__(config, maxCores, maxMemory, maxDisk)
  self.partitions = SlurmBatchSystem.PartitionSet()
+ # Record when the workflow started, so we know when to stop looking for
+ # jobs we ran.
+ self.start_time = datetime.now().astimezone(None)

  # Override issuing jobs so we can check if we need to use Slurm's magic
  # whole-node-memory feature.
toil/cwl/cwltoil.py CHANGED
@@ -110,6 +110,11 @@ from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources
  from toil.batchSystems.registry import DEFAULT_BATCH_SYSTEM
  from toil.common import Config, Toil, addOptions
  from toil.cwl import check_cwltool_version
+ from toil.lib.directory import (
+ DirectoryContents,
+ decode_directory,
+ encode_directory,
+ )
  from toil.lib.trs import resolve_workflow
  from toil.lib.misc import call_command
  from toil.provisioners.clusterScaler import JobTooBigError
@@ -1156,7 +1161,7 @@ class ToilCommandLineTool(ToilTool, cwltool.command_line_tool.CommandLineTool):
  """Subclass the cwltool command line tool to provide the custom ToilPathMapper."""

  def _initialworkdir(
- self, j: cwltool.job.JobBase, builder: cwltool.builder.Builder
+ self, j: Optional[cwltool.job.JobBase], builder: cwltool.builder.Builder
  ) -> None:
  """
  Hook the InitialWorkDirRequirement setup to make sure that there are no
@@ -1166,6 +1171,9 @@ class ToilCommandLineTool(ToilTool, cwltool.command_line_tool.CommandLineTool):
  # Set up the initial work dir with all its files
  super()._initialworkdir(j, builder)

+ if j is None:
+ return # Only testing
+
  # The initial work dir listing is now in j.generatefiles["listing"]
  # Also j.generatefiles is a CWL Directory.
  # So check the initial working directory.
@@ -1219,79 +1227,6 @@ def toil_make_tool(
  # URI instead of raising an error right away, in case it is optional.
  MISSING_FILE = "missing://"

- DirectoryContents = dict[str, Union[str, "DirectoryContents"]]
-
-
- def check_directory_dict_invariants(contents: DirectoryContents) -> None:
- """
- Make sure a directory structure dict makes sense. Throws an error
- otherwise.
-
- Currently just checks to make sure no empty-string keys exist.
- """
-
- for name, item in contents.items():
- if name == "":
- raise RuntimeError(
- "Found nameless entry in directory: " + json.dumps(contents, indent=2)
- )
- if isinstance(item, dict):
- check_directory_dict_invariants(item)
-
-
- def decode_directory(
- dir_path: str,
- ) -> tuple[DirectoryContents, Optional[str], str]:
- """
- Decode a directory from a "toildir:" path to a directory (or a file in it).
-
- Returns the decoded directory dict, the remaining part of the path (which may be
- None), and the deduplication key string that uniquely identifies the
- directory.
- """
- if not dir_path.startswith("toildir:"):
- raise RuntimeError(f"Cannot decode non-directory path: {dir_path}")
-
- # We will decode the directory and then look inside it
-
- # Since this was encoded by upload_directory we know the
- # next piece is encoded JSON describing the directory structure,
- # and it can't contain any slashes.
- parts = dir_path[len("toildir:") :].split("/", 1)
-
- # Before the first slash is the encoded data describing the directory contents
- dir_data = parts[0]
-
- # Decode what to download
- contents = json.loads(
- base64.urlsafe_b64decode(dir_data.encode("utf-8")).decode("utf-8")
- )
-
- check_directory_dict_invariants(contents)
-
- if len(parts) == 1 or parts[1] == "/":
- # We didn't have any subdirectory
- return contents, None, dir_data
- else:
- # We have a path below this
- return contents, parts[1], dir_data
-
-
- def encode_directory(contents: DirectoryContents) -> str:
- """
- Encode a directory from a "toildir:" path to a directory (or a file in it).
-
- Takes the directory dict, which is a dict from name to URI for a file or
- dict for a subdirectory.
- """
-
- check_directory_dict_invariants(contents)
-
- return "toildir:" + base64.urlsafe_b64encode(
- json.dumps(contents).encode("utf-8")
- ).decode("utf-8")
-
-
  class ToilFsAccess(StdFsAccess):
  """
  Custom filesystem access class which handles toil filestore references.
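Note: the helpers deleted above now live in the new toil/lib/directory.py (see the import hunk earlier in this file and entry 22 in the file list), and the call sites below unpack five values from decode_directory instead of three. The "toildir:" encoding itself is unchanged in spirit; a minimal self-contained sketch of the round trip as shown in the deleted code:

    import base64
    import json

    # Minimal sketch of the "toildir:" encoding from the deleted helpers above
    # (the real implementations are now in toil.lib.directory).
    contents = {"out": {"result.txt": "toilfile:abc123"}}

    encoded = "toildir:" + base64.urlsafe_b64encode(
        json.dumps(contents).encode("utf-8")
    ).decode("utf-8")

    # Decoding splits off the base64 payload before any "/" subpath and parses it.
    payload = encoded[len("toildir:"):].split("/", 1)[0]
    decoded = json.loads(base64.urlsafe_b64decode(payload.encode("utf-8")).decode("utf-8"))
    assert decoded == contents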
@@ -1360,7 +1295,7 @@ class ToilFsAccess(StdFsAccess):

  # Decode its contents, the path inside it to the file (if any), and
  # the key to use for caching the directory.
- contents, subpath, cache_key = decode_directory(path)
+ contents, subpath, cache_key, _, _ = decode_directory(path)
  logger.debug("Decoded directory contents: %s", contents)

  if cache_key not in self.dir_to_download:
@@ -1462,7 +1397,7 @@ class ToilFsAccess(StdFsAccess):
  # Handle local files
  return open(self._abs(fn), mode)
  elif parse.scheme == "toildir":
- contents, subpath, cache_key = decode_directory(fn)
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
  if cache_key in self.dir_to_download:
  # This is already available locally, so fall back on the local copy
  return open(self._abs(fn), mode)
@@ -1503,7 +1438,7 @@ class ToilFsAccess(StdFsAccess):
  except NoSuchFileException:
  return False
  elif parse.scheme == "toildir":
- contents, subpath, cache_key = decode_directory(path)
+ contents, subpath, cache_key, _, _ = decode_directory(path)
  if subpath is None:
  # The toildir directory itself exists
  return True
@@ -1530,7 +1465,7 @@ class ToilFsAccess(StdFsAccess):
  elif parse.scheme == "toildir":
  # Decode its contents, the path inside it to the file (if any), and
  # the key to use for caching the directory.
- contents, subpath, cache_key = decode_directory(path)
+ contents, subpath, cache_key, _, _ = decode_directory(path)

  # We can't get the size of just a directory.
  if subpath is None:
@@ -1564,7 +1499,7 @@ class ToilFsAccess(StdFsAccess):
  # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
  return True
  elif parse.scheme == "toildir":
- contents, subpath, cache_key = decode_directory(fn)
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
  if subpath is None:
  # This is the toildir directory itself
  return False
@@ -1583,7 +1518,7 @@ class ToilFsAccess(StdFsAccess):
  elif parse.scheme == "toilfile":
  return False
  elif parse.scheme == "toildir":
- contents, subpath, cache_key = decode_directory(fn)
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
  if subpath is None:
  # This is the toildir directory itself.
  # TODO: We assume directories can't be deleted.
@@ -1611,7 +1546,7 @@ class ToilFsAccess(StdFsAccess):
  elif parse.scheme == "toilfile":
  raise RuntimeError(f"Cannot list a file: {fn}")
  elif parse.scheme == "toildir":
- contents, subpath, cache_key = decode_directory(fn)
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
  here = contents
  if subpath is not None:
  got = get_from_structure(contents, subpath)
@@ -2402,7 +2337,7 @@ def toilStageFiles(

  if file_id_or_contents.startswith("toildir:"):
  # Get the directory contents and the path into them, if any
- here, subpath, _ = decode_directory(file_id_or_contents)
+ here, subpath, _, _, _ = decode_directory(file_id_or_contents)
  if subpath is not None:
  for part in subpath.split("/"):
  here = cast(DirectoryContents, here[part])
toil/fileStores/__init__.py CHANGED
@@ -28,7 +28,7 @@ class FileID(str):
  the job store if unavailable in the ID.
  """

- def __new__(cls, fileStoreID: str, *args: Any) -> "FileID":
+ def __new__(cls, fileStoreID: str, *args: Any, **kwargs: dict[str, Any]) -> "FileID":
  return super().__new__(cls, fileStoreID)

  def __init__(self, fileStoreID: str, size: int, executable: bool = False) -> None:
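Note: the **kwargs pass-through matters because str.__new__ only wants the string value while FileID.__init__ takes extra parameters; without it, constructing a FileID with keyword arguments fails before __init__ ever runs. A hedged sketch of the pattern using a stand-in class rather than the real FileID:

    from typing import Any

    # Sketch of why __new__ needs to swallow keyword arguments for a str subclass
    # whose __init__ takes extra parameters (mirrors the FileID change above).
    class FileIDSketch(str):
        def __new__(cls, fileStoreID: str, *args: Any, **kwargs: Any) -> "FileIDSketch":
            # str.__new__ only needs the string value; extra args are for __init__.
            return super().__new__(cls, fileStoreID)

        def __init__(self, fileStoreID: str, size: int, executable: bool = False) -> None:
            self.size = size
            self.executable = executable

    # Works with positional or keyword arguments; without **kwargs in __new__,
    # the keyword form would raise TypeError.
    fid = FileIDSketch("file-abc", 1024, executable=True)
    assert fid == "file-abc" and fid.size == 1024 and fid.executable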