looper 1.5.0__py3-none-any.whl → 1.6.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- looper/__init__.py +3 -498
- looper/__main__.py +2 -2
- looper/_version.py +1 -1
- looper/cli_divvy.py +182 -0
- looper/cli_looper.py +776 -0
- looper/conductor.py +53 -206
- looper/const.py +51 -3
- looper/divvy.py +28 -196
- looper/exceptions.py +18 -0
- looper/looper.py +177 -612
- looper/plugins.py +160 -0
- looper/processed_project.py +1 -1
- looper/project.py +229 -117
- looper/utils.py +119 -43
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/METADATA +6 -6
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/RECORD +20 -20
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/WHEEL +1 -1
- looper/html_reports.py +0 -1057
- looper/html_reports_pipestat.py +0 -924
- looper/html_reports_project_pipestat.py +0 -269
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/LICENSE.txt +0 -0
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/entry_points.txt +0 -0
- {looper-1.5.0.dist-info → looper-1.6.0a1.dist-info}/top_level.txt +0 -0
looper/looper.py
CHANGED
@@ -4,17 +4,12 @@ Looper: a pipeline submission engine. https://github.com/pepkit/looper
 """

 import abc
+import argparse
 import csv
 import logging
 import subprocess
-import sys
-
-if sys.version_info < (3, 3):
-from collections import Mapping
-else:
-from collections.abc import Mapping
-
-import logmuse
+import yaml
+import os
 import pandas as _pd

 # Need specific sequence of actions for colorama imports?
@@ -23,11 +18,12 @@ from colorama import init
 init()
 from shutil import rmtree

+# from collections.abc import Mapping
+from collections import defaultdict
 from colorama import Fore, Style
-from eido import
+from eido import validate_config, validate_sample
 from eido.exceptions import EidoValidationError
 from jsonschema import ValidationError
-from pephubclient import PEPHubClient
 from peppy.const import *
 from peppy.exceptions import RemoteYAMLError
 from rich.color import Color
@@ -36,21 +32,20 @@ from rich.table import Table
 from ubiquerg.cli_tools import query_yes_no
 from ubiquerg.collection import uniqify

-
+
 from .conductor import SubmissionConductor
+
+from .exceptions import *
 from .const import *
-from .divvy import DEFAULT_COMPUTE_RESOURCES_NAME, select_divvy_config
-from .exceptions import (
-JobSubmissionException,
-MisconfigurationException,
-SampleFailedException,
-)
-from .html_reports import HTMLReportBuilderOld
-from .html_reports_pipestat import HTMLReportBuilder, fetch_pipeline_results
-from .html_reports_project_pipestat import HTMLReportBuilderProject
 from .pipeline_interface import PipelineInterface
-from .project import Project
-from .utils import
+from .project import Project
+from .utils import (
+desired_samples_range_skipped,
+desired_samples_range_limited,
+sample_folder,
+)
+from pipestat.reports import get_file_for_table
+from pipestat.reports import get_file_for_project

 _PKGNAME = "looper"
 _LOGGER = logging.getLogger(_PKGNAME)
@@ -104,7 +99,7 @@ class Checker(Executor):
 for sample in self.prj.samples:
 psms = self.prj.get_pipestat_managers(sample_name=sample.sample_name)
 for pipeline_name, psm in psms.items():
-s = psm.get_status(
+s = psm.get_status(record_identifier=sample.sample_name)
 status.setdefault(pipeline_name, {})
 status[pipeline_name][sample.sample_name] = s
 _LOGGER.debug(f"{sample.sample_name} ({pipeline_name}): {s}")
@@ -171,60 +166,7 @@ class Checker(Executor):
 desc = ""
 table.add_row(status, desc)
 console.print(table)
-
-
-class CheckerOld(Executor):
-def __call__(self, flags=None, all_folders=False, max_file_count=30):
-"""
-Check Project status, based on flag files.
-
-:param Iterable[str] | str flags: Names of flags to check, optional;
-if unspecified, all known flags will be checked.
-:param bool all_folders: Whether to check flags in all folders, not
-just those for samples in the config file from which the Project
-was created.
-:param int max_file_count: Maximum number of filepaths to display for a
-given flag.
-"""
-
-# Handle single or multiple flags, and alphabetize.
-flags = sorted([flags] if isinstance(flags, str) else list(flags or FLAGS))
-flag_text = ", ".join(flags)
-
-# Collect the files by flag and sort by flag name.
-_LOGGER.debug("Checking project folders for flags: %s", flag_text)
-if all_folders:
-files_by_flag = fetch_flag_files(
-results_folder=self.prj.results_folder, flags=flags
-)
-else:
-files_by_flag = fetch_flag_files(prj=self.prj, flags=flags)
-
-# For each flag, output occurrence count.
-for flag in flags:
-_LOGGER.info("%s: %d", flag.upper(), len(files_by_flag[flag]))
-
-# For each flag, output filepath(s) if not overly verbose.
-for flag in flags:
-try:
-files = files_by_flag[flag]
-except Exception as e:
-_LOGGER.debug(
-"No files for {} flag. Caught exception: {}".format(
-flags, getattr(e, "message", repr(e))
-)
-)
-continue
-# If checking on a specific flag, do not limit the number of
-# reported filepaths, but do not report empty file lists
-if len(flags) == 1 and len(files) > 0:
-_LOGGER.info("%s (%d):\n%s", flag.upper(), len(files), "\n".join(files))
-# Regardless of whether 0-count flags are previously reported,
-# don't report an empty file list for a flag that's absent.
-# If the flag-to-files mapping is defaultdict, absent flag (key)
-# will fetch an empty collection, so check for length of 0.
-if 0 < len(files) <= max_file_count:
-_LOGGER.info("%s (%d):\n%s", flag.upper(), len(files), "\n".join(files))
+return status


 class Cleaner(Executor):
@@ -270,7 +212,8 @@ class Cleaner(Executor):
 return self(args, preview_flag=False)


-
+# NOTE: Adding type hint -> Iterable[Any] gives me TypeError: 'ABCMeta' object is not subscriptable
+def select_samples(prj: Project, args: argparse.Namespace):
 """Use CLI limit/skip arguments to select subset of project's samples."""
 # TODO: get proper element type for signature.
 num_samples = len(prj.samples)
@@ -310,7 +253,17 @@ class Destroyer(Executor):
 _remove_or_dry_run(sample_output_folder, args.dry_run)

 _LOGGER.info("Removing summary:")
-
+use_pipestat = (
+self.prj.pipestat_configured_project
+if args.project
+else self.prj.pipestat_configured
+)
+if use_pipestat:
+destroy_summary(self.prj, args.dry_run, args.project)
+else:
+_LOGGER.warning(
+"Pipestat must be configured to destroy any created summaries."
+)

 if not preview_flag:
 _LOGGER.info("Destroy complete.")
@@ -354,6 +307,7 @@ class Collator(Executor):
 arguments, recognized by looper
 """
 jobs = 0
+self.debug = {}
 project_pifaces = self.prj.project_pipeline_interface_sources
 if not project_pifaces:
 raise MisconfigurationException(
@@ -399,6 +353,8 @@ class Collator(Executor):
 jobs += conductor.num_job_submissions
 _LOGGER.info("\nLooper finished")
 _LOGGER.info("Jobs submitted: {}".format(jobs))
+self.debug[DEBUG_JOBS] = jobs
+return self.debug


 class Runner(Executor):
@@ -415,6 +371,7 @@ class Runner(Executor):
 :param bool rerun: whether the given sample is being rerun rather than
 run for the first time
 """
+self.debug = {} # initialize empty dict for return values
 max_cmds = sum(list(map(len, self.prj._samples_by_interface.values())))
 self.counter.total = max_cmds
 failures = defaultdict(list) # Collect problems by sample.
@@ -453,6 +410,9 @@ class Runner(Executor):
 submission_conductors[piface.pipe_iface_file] = conductor

 _LOGGER.info(f"Pipestat compatible: {self.prj.pipestat_configured_project}")
+self.debug["Pipestat compatible"] = (
+self.prj.pipestat_configured_project or self.prj.pipestat_configured
+)

 for sample in select_samples(prj=self.prj, args=args):
 pl_fails = []
@@ -474,10 +434,17 @@ class Runner(Executor):
 try:
 validate_sample(self.prj, sample.sample_name, schema_file)
 except EidoValidationError as e:
-_LOGGER.error(
+_LOGGER.error(
+f"Short-circuiting due to validation error!\nSchema file: "
+f"{schema_file}\nError: {e}\n{list(e.errors_by_type.keys())}"
+)
+self.debug[DEBUG_EIDO_VALIDATION] = (
+f"Short-circuiting due to validation error!\nSchema file: "
+f"{schema_file}\nError: {e}\n{list(e.errors_by_type.keys())}"
+)
 return False
 except RemoteYAMLError:
-_LOGGER.
+_LOGGER.warning(
 f"Could not read remote schema, skipping '{sample.sample_name}' "
 f"sample validation against {schema_file}"
 )
@@ -518,9 +485,15 @@ class Runner(Executor):
 )
 )
 _LOGGER.info("Commands submitted: {} of {}".format(cmd_sub_total, max_cmds))
-
+self.debug[DEBUG_COMMANDS] = "{} of {}".format(cmd_sub_total, max_cmds)
 if args.dry_run:
-
+job_sub_total_if_real = job_sub_total
+job_sub_total = 0
+_LOGGER.info(
+f"Dry run. No jobs were actually submitted, but {job_sub_total_if_real} would have been."
+)
+_LOGGER.info("Jobs submitted: {}".format(job_sub_total))
+self.debug[DEBUG_JOBS] = job_sub_total

 # Restructure sample/failure data for display.
 samples_by_reason = defaultdict(set)
@@ -528,6 +501,7 @@ class Runner(Executor):
 for sample, failures in failures.items():
 for f in failures:
 samples_by_reason[f].add(sample)
+self.debug[f] = sample
 # Collect samples by pipeline with submission failure.
 for piface, conductor in submission_conductors.items():
 # Don't add failure key if there are no samples that failed for
@@ -562,6 +536,8 @@ class Runner(Executor):
 _LOGGER.debug("Raising SampleFailedException")
 raise SampleFailedException

+return self.debug
+

 class Reporter(Executor):
 """Combine project outputs into a browsable HTML report"""
@@ -576,305 +552,82 @@ class Reporter(Executor):
 print(psms)
 for name, psm in psms.items():
 # Summarize will generate the static HTML Report Function
-psm.summarize()
+report_directory = psm.summarize(looper_samples=self.prj.samples)
+print(f"Report directory: {report_directory}")
 else:
-for
-
+for piface_source_samples in self.prj._samples_by_piface(
+self.prj.piface_key
+).values():
+# For each piface_key, we have a list of samples, but we only need one sample from the list to
+# call the related pipestat manager object which will pull ALL samples when using psm.summarize
+first_sample_name = list(piface_source_samples)[0]
+psms = self.prj.get_pipestat_managers(
+sample_name=first_sample_name, project_level=False
+)
 print(psms)
 for name, psm in psms.items():
 # Summarize will generate the static HTML Report Function
-psm.summarize()
+report_directory = psm.summarize(looper_samples=self.prj.samples)
+print(f"Report directory: {report_directory}")


-class
-"""
+class Linker(Executor):
+"""Create symlinks for reported results. Requires pipestat to be configured."""

 def __call__(self, args):
+# initialize the report builder
+p = self.prj
 project_level = args.project
+link_dir = args.output_dir
+
 if project_level:
-
-for
-
-
-# pull together all the fits and stats from each sample into
-# project-combined spreadsheets.
-self.stats = _create_stats_summary(
-self.prj, pipeline_name, project_level, self.counter
-)
-self.objs = _create_obj_summary(
-self.prj, pipeline_name, project_level, self.counter
-)
+psms = self.prj.get_pipestat_managers(project_level=True)
+for name, psm in psms.items():
+linked_results_path = psm.link(link_dir=link_dir)
+print(f"Linked directory: {linked_results_path}")
 else:
-for
+for piface_source_samples in self.prj._samples_by_piface(
 self.prj.piface_key
-).
-#
-
-
-
-
-self.prj, pipeline_name, project_level, self.counter
-)
-self.objs = _create_obj_summary(
-self.prj, pipeline_name, project_level, self.counter
+).values():
+# For each piface_key, we have a list of samples, but we only need one sample from the list to
+# call the related pipestat manager object which will pull ALL samples when using psm.summarize
+first_sample_name = list(piface_source_samples)[0]
+psms = self.prj.get_pipestat_managers(
+sample_name=first_sample_name, project_level=False
 )
-
+for name, psm in psms.items():
+linked_results_path = psm.link(link_dir=link_dir)
+print(f"Linked directory: {linked_results_path}")


-
-"""
-Create stats spreadsheet and columns to be considered in the report, save
-the spreadsheet to file
-
-:param looper.Project project: the project to be summarized
-:param str pipeline_name: name of the pipeline to tabulate results for
-:param bool project_level: whether the project-level pipeline resutlts
-should be tabulated
-:param looper.LooperCounter counter: a counter object
-"""
-# Create stats_summary file
-columns = set()
-stats = []
-_LOGGER.info("Creating stats summary")
-if project_level:
-_LOGGER.info(
-counter.show(name=project.name, type="project", pipeline_name=pipeline_name)
-)
-reported_stats = {"project_name": project.name}
-results = fetch_pipeline_results(
-project=project,
-pipeline_name=pipeline_name,
-inclusion_fun=lambda x: x not in OBJECT_TYPES,
-)
-reported_stats.update(results)
-stats.append(reported_stats)
-columns |= set(reported_stats.keys())
+class Tabulator(Executor):
+"""Project/Sample statistics and table output generator

-
-for sample in project.samples:
-sn = sample.sample_name
-_LOGGER.info(counter.show(sn, pipeline_name))
-reported_stats = {project.sample_table_index: sn}
-results = fetch_pipeline_results(
-project=project,
-pipeline_name=pipeline_name,
-sample_name=sn,
-inclusion_fun=lambda x: x not in OBJECT_TYPES,
-)
-reported_stats.update(results)
-stats.append(reported_stats)
-columns |= set(reported_stats.keys())
-
-tsv_outfile_path = get_file_for_project(project, pipeline_name, "stats_summary.tsv")
-tsv_outfile = open(tsv_outfile_path, "w")
-tsv_writer = csv.DictWriter(
-tsv_outfile, fieldnames=list(columns), delimiter="\t", extrasaction="ignore"
-)
-tsv_writer.writeheader()
-for row in stats:
-tsv_writer.writerow(row)
-tsv_outfile.close()
-_LOGGER.info(
-f"'{pipeline_name}' pipeline stats summary (n={len(stats)}):"
-f" {tsv_outfile_path}"
-)
-counter.reset()
-return stats
-
-
-def _create_obj_summary(project, pipeline_name, project_level, counter):
+:return list[str|any] results: list containing output file paths of stats and objects
 """
-Read sample specific objects files and save to a data frame
-
-:param looper.Project project: the project to be summarized
-:param str pipeline_name: name of the pipeline to tabulate results for
-:param looper.LooperCounter counter: a counter object
-:param bool project_level: whether the project-level pipeline resutlts
-should be tabulated
-"""
-_LOGGER.info("Creating objects summary")
-reported_objects = {}
-if project_level:
-_LOGGER.info(
-counter.show(name=project.name, type="project", pipeline_name=pipeline_name)
-)
-res = fetch_pipeline_results(
-project=project,
-pipeline_name=pipeline_name,
-inclusion_fun=lambda x: x in OBJECT_TYPES,
-)
-# need to cast to a dict, since other mapping-like objects might
-# cause issues when writing to the collective yaml file below
-project_reported_objects = {k: dict(v) for k, v in res.items()}
-reported_objects[project.name] = project_reported_objects
-else:
-for sample in project.samples:
-sn = sample.sample_name
-_LOGGER.info(counter.show(sn, pipeline_name))
-res = fetch_pipeline_results(
-project=project,
-pipeline_name=pipeline_name,
-sample_name=sn,
-inclusion_fun=lambda x: x in OBJECT_TYPES,
-)
-# need to cast to a dict, since other mapping-like objects might
-# cause issues when writing to the collective yaml file below
-sample_reported_objects = {k: dict(v) for k, v in res.items()}
-reported_objects[sn] = sample_reported_objects
-objs_yaml_path = get_file_for_project(project, pipeline_name, "objs_summary.yaml")
-with open(objs_yaml_path, "w") as outfile:
-yaml.dump(reported_objects, outfile)
-_LOGGER.info(
-f"'{pipeline_name}' pipeline objects summary "
-f"(n={len(reported_objects.keys())}): {objs_yaml_path}"
-)
-counter.reset()
-return reported_objects
-
-
-class ReportOld(Executor):
-"""Combine project outputs into a browsable HTML report"""
-
-def __init__(self, prj):
-# call the inherited initialization
-super(ReportOld, self).__init__(prj)
-self.prj = prj

 def __call__(self, args):
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-super(TableOld, self).__init__(prj)
-self.prj = prj
-
-def __call__(self):
-def _create_stats_summary_old(project, counter):
-"""
-Create stats spreadsheet and columns to be considered in the report, save
-the spreadsheet to file
-:param looper.Project project: the project to be summarized
-:param looper.LooperCounter counter: a counter object
-"""
-# Create stats_summary file
-columns = []
-stats = []
-project_samples = project.samples
-missing_files = []
-_LOGGER.info("Creating stats summary...")
-for sample in project_samples:
-_LOGGER.info(counter.show(sample.sample_name, sample.protocol))
-sample_output_folder = sample_folder(project, sample)
-# Grab the basic info from the annotation sheet for this sample.
-# This will correspond to a row in the output.
-sample_stats = sample.get_sheet_dict()
-columns.extend(sample_stats.keys())
-# Version 0.3 standardized all stats into a single file
-stats_file = os.path.join(sample_output_folder, "stats.tsv")
-if not os.path.isfile(stats_file):
-missing_files.append(stats_file)
-continue
-t = _pd.read_csv(
-stats_file, sep="\t", header=None, names=["key", "value", "pl"]
-)
-t.drop_duplicates(subset=["key", "pl"], keep="last", inplace=True)
-t.loc[:, "plkey"] = t["pl"] + ":" + t["key"]
-dupes = t.duplicated(subset=["key"], keep=False)
-t.loc[dupes, "key"] = t.loc[dupes, "plkey"]
-sample_stats.update(t.set_index("key")["value"].to_dict())
-stats.append(sample_stats)
-columns.extend(t.key.tolist())
-if missing_files:
-_LOGGER.warning(
-"Stats files missing for {} samples: {}".format(
-len(missing_files), missing_files
-)
-)
-tsv_outfile_path = get_file_for_project_old(project, "stats_summary.tsv")
-tsv_outfile = open(tsv_outfile_path, "w")
-tsv_writer = csv.DictWriter(
-tsv_outfile,
-fieldnames=uniqify(columns),
-delimiter="\t",
-extrasaction="ignore",
-)
-tsv_writer.writeheader()
-for row in stats:
-tsv_writer.writerow(row)
-tsv_outfile.close()
-_LOGGER.info(
-"Statistics summary (n=" + str(len(stats)) + "): " + tsv_outfile_path
-)
-counter.reset()
-return stats, uniqify(columns)
-
-def _create_obj_summary_old(project, counter):
-"""
-Read sample specific objects files and save to a data frame
-:param looper.Project project: the project to be summarized
-:param looper.LooperCounter counter: a counter object
-:return pandas.DataFrame: objects spreadsheet
-"""
-_LOGGER.info("Creating objects summary...")
-objs = _pd.DataFrame()
-# Create objects summary file
-missing_files = []
-for sample in project.samples:
-# Process any reported objects
-_LOGGER.info(counter.show(sample.sample_name, sample.protocol))
-sample_output_folder = sample_folder(project, sample)
-objs_file = os.path.join(sample_output_folder, "objects.tsv")
-if not os.path.isfile(objs_file):
-missing_files.append(objs_file)
-continue
-t = _pd.read_csv(
-objs_file,
-sep="\t",
-header=None,
-names=[
-"key",
-"filename",
-"anchor_text",
-"anchor_image",
-"annotation",
-],
-)
-t["sample_name"] = sample.sample_name
-objs = objs.append(t, ignore_index=True)
-if missing_files:
-_LOGGER.warning(
-"Object files missing for {} samples: {}".format(
-len(missing_files), missing_files
-)
+# p = self.prj
+project_level = args.project
+results = []
+if project_level:
+psms = self.prj.get_pipestat_managers(project_level=True)
+for name, psm in psms.items():
+results = psm.table()
+else:
+for piface_source_samples in self.prj._samples_by_piface(
+self.prj.piface_key
+).values():
+# For each piface_key, we have a list of samples, but we only need one sample from the list to
+# call the related pipestat manager object which will pull ALL samples when using psm.table
+first_sample_name = list(piface_source_samples)[0]
+psms = self.prj.get_pipestat_managers(
+sample_name=first_sample_name, project_level=False
 )
-
-
-
-
-"Objects summary (n="
-+ str(len(project.samples) - len(missing_files))
-+ "): "
-+ objs_file
-)
-
-# pull together all the fits and stats from each sample into
-# project-combined spreadsheets.
-self.stats, self.columns = _create_stats_summary_old(self.prj, self.counter)
-self.objs = _create_obj_summary_old(self.prj, self.counter)
-return self
-
+for name, psm in psms.items():
+results = psm.table()
+# Results contains paths to stats and object summaries.
+return results


 def _create_failure_message(reason, samples):
@@ -889,7 +642,7 @@ def _remove_or_dry_run(paths, dry_run=False):

 :param list|str paths: list of paths to files/dirs to be removed
 :param bool dry_run: logical indicating whether the files should remain
-untouched and
+untouched and message printed
 """
 paths = paths if isinstance(paths, list) else [paths]
 for path in paths:
@@ -906,20 +659,70 @@ def _remove_or_dry_run(paths, dry_run=False):
 _LOGGER.info(path + " does not exist.")


-def destroy_summary(prj, dry_run=False):
+def destroy_summary(prj, dry_run=False, project_level=False):
 """
 Delete the summary files if not in dry run mode
+This function is for use with pipestat configured projects.
 """
-
-
-
-
-
-
-
-
-
-
+
+if project_level:
+psms = prj.get_pipestat_managers(project_level=True)
+for name, psm in psms.items():
+_remove_or_dry_run(
+[
+get_file_for_project(
+psm,
+pipeline_name=psm["_pipeline_name"],
+directory="reports",
+),
+get_file_for_table(
+psm,
+pipeline_name=psm["_pipeline_name"],
+appendix="stats_summary.tsv",
+),
+get_file_for_table(
+psm,
+pipeline_name=psm["_pipeline_name"],
+appendix="objs_summary.yaml",
+),
+get_file_for_table(
+psm, pipeline_name=psm["_pipeline_name"], appendix="reports"
+),
+],
+dry_run,
+)
+else:
+for piface_source_samples in prj._samples_by_piface(prj.piface_key).values():
+# For each piface_key, we have a list of samples, but we only need one sample from the list to
+# call the related pipestat manager object which will pull ALL samples when using psm.table
+first_sample_name = list(piface_source_samples)[0]
+psms = prj.get_pipestat_managers(
+sample_name=first_sample_name, project_level=False
+)
+for name, psm in psms.items():
+_remove_or_dry_run(
+[
+get_file_for_project(
+psm,
+pipeline_name=psm["_pipeline_name"],
+directory="reports",
+),
+get_file_for_table(
+psm,
+pipeline_name=psm["_pipeline_name"],
+appendix="stats_summary.tsv",
+),
+get_file_for_table(
+psm,
+pipeline_name=psm["_pipeline_name"],
+appendix="objs_summary.yaml",
+),
+get_file_for_table(
+psm, pipeline_name=psm["_pipeline_name"], appendix="reports"
+),
+],
+dry_run,
+)


 class LooperCounter(object):
@@ -972,241 +775,3 @@ def _submission_status_text(
 if pipeline_name:
 txt += f"; pipeline: {pipeline_name}"
 return txt + Style.RESET_ALL
-
-
-def _proc_resources_spec(args):
-"""
-Process CLI-sources compute setting specification. There are two sources
-of compute settings in the CLI alone:
-* YAML file (--settings argument)
-* itemized compute settings (--compute argument)
-
-The itemized compute specification is given priority
-
-:param argparse.Namespace: arguments namespace
-:return Mapping[str, str]: binding between resource setting name and value
-:raise ValueError: if interpretation of the given specification as encoding
-of key-value pairs fails
-"""
-spec = getattr(args, "compute", None)
-try:
-settings_data = read_yaml_file(args.settings) or {}
-except yaml.YAMLError:
-_LOGGER.warning(
-"Settings file ({}) does not follow YAML format,"
-" disregarding".format(args.settings)
-)
-settings_data = {}
-if not spec:
-return settings_data
-pairs = [(kv, kv.split("=")) for kv in spec]
-bads = []
-for orig, pair in pairs:
-try:
-k, v = pair
-except ValueError:
-bads.append(orig)
-else:
-settings_data[k] = v
-if bads:
-raise ValueError(
-"Could not correctly parse itemized compute specification. "
-"Correct format: " + EXAMPLE_COMPUTE_SPEC_FMT
-)
-return settings_data
-
-
-def main(test_args=None):
-"""Primary workflow"""
-global _LOGGER
-
-parser, aux_parser = build_parser()
-aux_parser.suppress_defaults()
-
-if test_args:
-args, remaining_args = parser.parse_known_args(args=test_args)
-else:
-args, remaining_args = parser.parse_known_args()
-
-cli_use_errors = validate_post_parse(args)
-if cli_use_errors:
-parser.print_help(sys.stderr)
-parser.error(
-f"{len(cli_use_errors)} CLI use problem(s): {', '.join(cli_use_errors)}"
-)
-if args.command is None:
-parser.print_help(sys.stderr)
-sys.exit(1)
-if "config_file" in vars(args):
-if args.config_file is None:
-msg = "No project config defined (peppy)"
-try:
-if args.looper_config:
-looper_config_dict = read_looper_config_file(args.looper_config)
-else:
-looper_config_dict = read_looper_dotfile()
-print(
-msg + f", using: {read_looper_dotfile()}. "
-f"Read from dotfile ({dotfile_path()})."
-)
-
-for looper_config_key, looper_config_item in looper_config_dict.items():
-setattr(args, looper_config_key, looper_config_item)
-
-except OSError:
-print(msg + f" and dotfile does not exist: {dotfile_path()}")
-parser.print_help(sys.stderr)
-sys.exit(1)
-else:
-_LOGGER.warning(
-"The Looper config specification through the PEP project is deprecated and will "
-"be removed in future versions. Please use the new running method by "
-f"utilizing a looper config file. For more information: {'here is more information'} "
-)
-
-if args.command == "init":
-sys.exit(
-int(
-not init_dotfile(
-dotfile_path(),
-args.config_file,
-args.output_dir,
-args.sample_pipeline_interfaces,
-args.project_pipeline_interfaces,
-args.force,
-)
-)
-)
-
-if args.command == "init-piface":
-sys.exit(int(not init_generic_pipeline()))
-
-args = enrich_args_via_cfg(args, aux_parser, test_args)
-
-# If project pipeline interface defined in the cli, change name to: "pipeline_interface"
-if vars(args)[PROJECT_PL_ARG]:
-args.pipeline_interfaces = vars(args)[PROJECT_PL_ARG]
-
-_LOGGER = logmuse.logger_via_cli(args, make_root=True)
-
-_LOGGER.info("Looper version: {}\nCommand: {}".format(__version__, args.command))
-
-if len(remaining_args) > 0:
-_LOGGER.warning(
-"Unrecognized arguments: {}".format(
-" ".join([str(x) for x in remaining_args])
-)
-)
-
-divcfg = (
-select_divvy_config(filepath=args.divvy) if hasattr(args, "divvy") else None
-)
-
-# Initialize project
-if is_registry_path(args.config_file):
-if vars(args)[SAMPLE_PL_ARG]:
-p = Project(
-amendments=args.amend,
-divcfg_path=divcfg,
-runp=args.command == "runp",
-project_dict=PEPHubClient()._load_raw_pep(
-registry_path=args.config_file
-),
-**{
-attr: getattr(args, attr) for attr in CLI_PROJ_ATTRS if attr in args
-},
-)
-else:
-raise MisconfigurationException(
-f"`sample_pipeline_interface` is missing. Provide it in the parameters."
-)
-else:
-try:
-p = Project(
-cfg=args.config_file,
-amendments=args.amend,
-divcfg_path=divcfg,
-runp=args.command == "runp",
-**{
-attr: getattr(args, attr) for attr in CLI_PROJ_ATTRS if attr in args
-},
-)
-except yaml.parser.ParserError as e:
-_LOGGER.error(f"Project config parse failed -- {e}")
-sys.exit(1)
-
-selected_compute_pkg = p.selected_compute_package or DEFAULT_COMPUTE_RESOURCES_NAME
-if p.dcc is not None and not p.dcc.activate_package(selected_compute_pkg):
-_LOGGER.info(
-"Failed to activate '{}' computing package. "
-"Using the default one".format(selected_compute_pkg)
-)
-
-with ProjectContext(
-prj=p,
-selector_attribute=args.sel_attr,
-selector_include=args.sel_incl,
-selector_exclude=args.sel_excl,
-) as prj:
-if args.command in ["run", "rerun"]:
-run = Runner(prj)
-try:
-compute_kwargs = _proc_resources_spec(args)
-run(args, rerun=(args.command == "rerun"), **compute_kwargs)
-except SampleFailedException:
-sys.exit(1)
-except IOError:
-_LOGGER.error(
-"{} pipeline_interfaces: '{}'".format(
-prj.__class__.__name__, prj.pipeline_interface_sources
-)
-)
-raise
-
-if args.command == "runp":
-compute_kwargs = _proc_resources_spec(args)
-collate = Collator(prj)
-collate(args, **compute_kwargs)
-
-if args.command == "destroy":
-return Destroyer(prj)(args)
-
-# pipestat support introduces breaking changes and pipelines run
-# with no pipestat reporting would not be compatible with
-# commands: table, report and check. Therefore we plan maintain
-# the old implementations for a couple of releases.
-if hasattr(args, "project"):
-use_pipestat = (
-prj.pipestat_configured_project
-if args.project
-else prj.pipestat_configured
-)
-if args.command == "table":
-if use_pipestat:
-Tabulator(prj)(args)
-else:
-TableOld(prj)()
-
-if args.command == "report":
-if use_pipestat:
-Reporter(prj)(args)
-else:
-ReportOld(prj)(args)
-
-if args.command == "check":
-if use_pipestat:
-Checker(prj)(args)
-else:
-CheckerOld(prj)(flags=args.flags)
-
-if args.command == "clean":
-return Cleaner(prj)(args)
-
-if args.command == "inspect":
-inspect_project(p, args.sample_names, args.attr_limit)
-from warnings import warn
-
-warn(
-"The inspect feature has moved to eido and will be removed in the future release of looper. "
-"Use `eido inspect` from now on.",
-)