biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +6 -0
- biopipen/core/filters.py +35 -23
- biopipen/core/testing.py +6 -1
- biopipen/ns/bam.py +39 -0
- biopipen/ns/cellranger.py +5 -0
- biopipen/ns/cellranger_pipeline.py +2 -2
- biopipen/ns/cnvkit_pipeline.py +4 -1
- biopipen/ns/delim.py +33 -27
- biopipen/ns/protein.py +99 -0
- biopipen/ns/scrna.py +428 -250
- biopipen/ns/snp.py +16 -3
- biopipen/ns/tcr.py +125 -1
- biopipen/ns/vcf.py +34 -0
- biopipen/ns/web.py +5 -1
- biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
- biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
- biopipen/reports/tcr/ClonalStats.svelte +15 -0
- biopipen/reports/utils/misc.liq +20 -7
- biopipen/scripts/bam/BamMerge.py +2 -2
- biopipen/scripts/bam/BamSampling.py +4 -4
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +3 -3
- biopipen/scripts/bam/CNVpytor.py +10 -10
- biopipen/scripts/bam/ControlFREEC.py +11 -11
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
- biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +20 -9
- biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
- biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
- biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/SampleInfo.R +94 -148
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +4 -4
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifScan.py +8 -8
- biopipen/scripts/scrna/CellCellCommunication.py +59 -22
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MarkersFinder.R +273 -654
- biopipen/scripts/scrna/RadarPlots.R +73 -53
- biopipen/scripts/scrna/SCP-plot.R +15202 -0
- biopipen/scripts/scrna/ScVelo.py +0 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
- biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
- biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
- biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
- biopipen/scripts/scrna/SeuratPreparing.R +138 -81
- biopipen/scripts/scrna/SlingShot.R +71 -0
- biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
- biopipen/scripts/snp/Plink2GTMat.py +26 -11
- biopipen/scripts/snp/PlinkFilter.py +7 -7
- biopipen/scripts/snp/PlinkFromVcf.py +8 -5
- biopipen/scripts/snp/PlinkSimulation.py +4 -4
- biopipen/scripts/snp/PlinkUpdateName.py +4 -4
- biopipen/scripts/stats/ChowTest.R +48 -22
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/ClonalStats.R +484 -0
- biopipen/scripts/tcr/ScRepLoading.R +127 -0
- biopipen/scripts/tcr/TCRDock.py +10 -6
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
- biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +4 -4
- biopipen/scripts/vcf/BcftoolsView.py +5 -5
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +12 -3
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +3 -3
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
- biopipen/scripts/web/gcloud_common.py +1 -1
- biopipen/utils/gsea.R +75 -35
- biopipen/utils/misc.R +205 -7
- biopipen/utils/misc.py +17 -8
- biopipen/utils/reference.py +11 -11
- biopipen/utils/repr.R +146 -0
- biopipen/utils/vcf.py +1 -1
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
- biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0
|
@@ -4,9 +4,9 @@ from diot import Diot
|
|
|
4
4
|
|
|
5
5
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
6
6
|
|
|
7
|
-
segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa
|
|
7
|
+
segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa # noqa
|
|
8
8
|
sample_sex = {{in.sample_sex | repr}} # pyright: ignore
|
|
9
|
-
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
9
|
+
outdir: str = {{out.outdir | repr}} # pyright: ignore
|
|
10
10
|
cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
|
|
11
11
|
convert = {{envs.convert | quote}} # pyright: ignore
|
|
12
12
|
convert_args = {{envs.convert_args | repr}} # pyright: ignore
|
|
@@ -16,7 +16,7 @@ desaturate= {{ envs.desaturate | repr}} # pyright: ignore
|
|
|
16
16
|
male_reference= {{ envs.male_reference | repr}} # pyright: ignore
|
|
17
17
|
no_shift_xy= {{ envs.no_shift_xy | repr}} # pyright: ignore
|
|
18
18
|
order = {{envs.order | repr}} # pyright: ignore
|
|
19
|
-
cases = {{envs.cases | repr}} # pyright: ignore
|
|
19
|
+
cases: dict | None = {{envs.cases | repr}} # pyright: ignore
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def parse_order(files, orderfile):
|
|
@@ -70,7 +70,7 @@ def do_case(name, case):
|
|
|
70
70
|
args[""] = [cnvkit, "heatmap"]
|
|
71
71
|
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
72
72
|
|
|
73
|
-
conv_args = dict(**conv_args, _=[pdffile, pngfile])
|
|
73
|
+
conv_args: dict = dict(**conv_args, _=[pdffile, pngfile])
|
|
74
74
|
conv_args[""] = [convert]
|
|
75
75
|
run_command(
|
|
76
76
|
dict_to_cli_args(conv_args, dashify=True, prefix="-"),
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
3
3
|
|
|
4
|
-
covfiles = {{in.covfiles | repr}} # pyright: ignore
|
|
4
|
+
covfiles = {{in.covfiles | repr}} # pyright: ignore # noqa
|
|
5
5
|
target_file = {{in.target_file | repr}} # pyright: ignore
|
|
6
6
|
antitarget_file = {{in.antitarget_file | repr}} # pyright: ignore
|
|
7
7
|
sample_sex = {{in.sample_sex | repr}} # pyright: ignore
|
|
8
8
|
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
9
|
-
reffile = {{envs.ref |
|
|
9
|
+
reffile: str = {{envs.ref | quote}} # pyright: ignore
|
|
10
10
|
cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
|
|
11
11
|
cluster = {{envs.cluster | repr}} # pyright: ignore
|
|
12
12
|
min_cluster_size = {{envs.min_cluster_size | repr}} # pyright: ignore
|
|
@@ -4,14 +4,14 @@ from diot import Diot
|
|
|
4
4
|
|
|
5
5
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
6
6
|
|
|
7
|
-
cnrfile = {{in.cnrfile | quote}} # pyright: ignore
|
|
7
|
+
cnrfile = {{in.cnrfile | quote}} # pyright: ignore # noqa
|
|
8
8
|
cnsfile = {{in.cnsfile | quote}} # pyright: ignore
|
|
9
9
|
convert = {{envs.convert | quote}} # pyright: ignore
|
|
10
10
|
convert_args = {{envs.convert_args | repr}} # pyright: ignore
|
|
11
11
|
vcf = {{in.vcf | repr}} # pyright: ignore
|
|
12
12
|
sample_id = {{in.sample_id | repr}} # pyright: ignore
|
|
13
13
|
normal_id = {{in.normal_id | repr}} # pyright: ignore
|
|
14
|
-
outdir = {{out.outdir | quote}} # pyright: ignore
|
|
14
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
15
15
|
cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
|
|
16
16
|
chromosome = {{envs.chromosome | repr}} # pyright: ignore
|
|
17
17
|
gene = {{envs.gene | repr}} # pyright: ignore
|
|
@@ -25,7 +25,7 @@ y_min = {{envs.y_min | repr}} # pyright: ignore
|
|
|
25
25
|
min_variant_depth = {{envs.min_variant_depth | repr}} # pyright: ignore
|
|
26
26
|
zygosity_freq = {{envs.zygosity_freq | repr}} # pyright: ignore
|
|
27
27
|
title = {{envs.title | repr}} # pyright: ignore
|
|
28
|
-
cases = {{envs.cases | repr}} # pyright: ignore
|
|
28
|
+
cases: dict | None = {{envs.cases | repr}} # pyright: ignore
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def do_case(name, case):
|
|
@@ -50,7 +50,7 @@ def do_case(name, case):
|
|
|
50
50
|
pdffile = Path(outdir).joinpath(f"{name}.heatmap.pdf")
|
|
51
51
|
pngfile = Path(outdir).joinpath(f"{name}.heatmap.png")
|
|
52
52
|
|
|
53
|
-
args = dict(
|
|
53
|
+
args: dict = dict(
|
|
54
54
|
**case,
|
|
55
55
|
s=cnsfile,
|
|
56
56
|
o=pdffile,
|
|
@@ -62,7 +62,7 @@ def do_case(name, case):
|
|
|
62
62
|
args[""] = [cnvkit, "scatter"]
|
|
63
63
|
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
64
64
|
|
|
65
|
-
conv_args = dict(**conv_args, _=[pdffile, pngfile])
|
|
65
|
+
conv_args: dict = dict(**conv_args, _=[pdffile, pngfile])
|
|
66
66
|
conv_args[""] = [convert]
|
|
67
67
|
run_command(
|
|
68
68
|
dict_to_cli_args(conv_args, dashify=True, prefix="-"),
|
|
@@ -2,11 +2,11 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
4
4
|
|
|
5
|
-
cnrfile = {{in.cnrfile | quote}} # pyright: ignore
|
|
5
|
+
cnrfile = {{in.cnrfile | quote}} # pyright: ignore # noqa
|
|
6
6
|
vcf = {{in.vcf | repr}} # pyright: ignore
|
|
7
7
|
sample_id = {{in.sample_id | repr}} # pyright: ignore
|
|
8
8
|
normal_id = {{in.normal_id | repr}} # pyright: ignore
|
|
9
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
9
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
10
10
|
cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
|
|
11
11
|
method = {{envs.method | quote}} # pyright: ignore
|
|
12
12
|
threshold = {{envs.threshold | repr}} # pyright: ignore
|
|
@@ -21,7 +21,7 @@ zygosity_freq = {{envs.zygosity_freq | repr}} # pyright: ignore
|
|
|
21
21
|
|
|
22
22
|
def main():
|
|
23
23
|
|
|
24
|
-
args = dict(
|
|
24
|
+
args: dict = dict(
|
|
25
25
|
o=outfile,
|
|
26
26
|
d=Path(outfile).parent / "intermediate.rds",
|
|
27
27
|
m=method,
|
|
@@ -39,8 +39,8 @@ def main():
|
|
|
39
39
|
_=cnrfile,
|
|
40
40
|
)
|
|
41
41
|
args[""] = [cnvkit, "segment"]
|
|
42
|
-
|
|
43
|
-
run_command(
|
|
42
|
+
cmd_args = dict_to_cli_args(args, dashify=True)
|
|
43
|
+
run_command(cmd_args, fg=True)
|
|
44
44
|
|
|
45
45
|
|
|
46
46
|
if __name__ == "__main__":
|
|
@@ -25,10 +25,10 @@ import sys
|
|
|
25
25
|
import numpy as np
|
|
26
26
|
import pandas as pd
|
|
27
27
|
|
|
28
|
-
import cnvlib
|
|
29
|
-
from cnvlib import parallel
|
|
30
|
-
from cnvlib.descriptives import modal_location
|
|
31
|
-
from skgenome import tabio, GenomicArray as GA
|
|
28
|
+
import cnvlib # type: ignore
|
|
29
|
+
from cnvlib import parallel # type: ignore
|
|
30
|
+
from cnvlib.descriptives import modal_location # type: ignore
|
|
31
|
+
from skgenome import tabio, GenomicArray as GA # type: ignore
|
|
32
32
|
|
|
33
33
|
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
34
34
|
|
|
@@ -36,11 +36,12 @@ logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
|
36
36
|
# ___________________________________________
|
|
37
37
|
# Guided method: guess from potential targets
|
|
38
38
|
|
|
39
|
+
|
|
39
40
|
def filter_targets(target_bed, sample_bams, procs, fasta):
|
|
40
41
|
"""Check if each potential target has significant coverage."""
|
|
41
42
|
try:
|
|
42
|
-
baits = tabio.read(target_bed,
|
|
43
|
-
except:
|
|
43
|
+
baits = tabio.read(target_bed, "bed4")
|
|
44
|
+
except: # noqa
|
|
44
45
|
raise RuntimeError("Targets must be in BED format; try skg_convert.py")
|
|
45
46
|
logging.info("Loaded %d candidate regions from %s", len(baits), target_bed)
|
|
46
47
|
# Loop over BAMs to calculate weighted averages of bin coverage depths
|
|
@@ -48,47 +49,46 @@ def filter_targets(target_bed, sample_bams, procs, fasta):
|
|
|
48
49
|
for bam_fname in sample_bams:
|
|
49
50
|
logging.info("Evaluating targets in %s", bam_fname)
|
|
50
51
|
sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta)
|
|
51
|
-
assert len(sample) == len(baits),
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
baits[
|
|
55
|
-
logging.info("Average candidate-target depth:\n%s",
|
|
56
|
-
baits['depth'].describe())
|
|
52
|
+
assert len(sample) == len(baits), "%d != %d" % (len(sample), len(baits))
|
|
53
|
+
total_depths += sample["depth"].values
|
|
54
|
+
baits["depth"] = total_depths / len(sample_bams)
|
|
55
|
+
logging.info("Average candidate-target depth:\n%s", baits["depth"].describe())
|
|
57
56
|
return baits
|
|
58
57
|
|
|
59
58
|
|
|
60
59
|
# _________________________________________
|
|
61
60
|
# Unguided method: guess from raw depths
|
|
62
61
|
|
|
63
|
-
|
|
64
|
-
|
|
62
|
+
|
|
63
|
+
def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length, procs):
|
|
65
64
|
"""Estimate baited regions from a genome-wide, per-base depth profile."""
|
|
66
65
|
bait_chunks = []
|
|
67
66
|
# ENH: context manager to call rm on bed chunks? with to_chunks as pool, ck?
|
|
68
|
-
logging.info("Scanning for enriched regions in:\n %s",
|
|
69
|
-
'\n '.join(sample_bams))
|
|
67
|
+
logging.info("Scanning for enriched regions in:\n %s", "\n ".join(sample_bams))
|
|
70
68
|
# with futures.ProcessPoolExecutor(procs) as pool:
|
|
71
69
|
with parallel.pick_pool(procs) as pool:
|
|
72
|
-
args_iter = (
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
args_iter = (
|
|
71
|
+
(bed_chunk, sample_bams, min_depth, min_gap, min_length)
|
|
72
|
+
for bed_chunk in parallel.to_chunks(access_bed)
|
|
73
|
+
)
|
|
75
74
|
for bed_chunk_fname, bait_chunk in pool.map(_scan_depth, args_iter):
|
|
76
75
|
bait_chunks.append(bait_chunk)
|
|
77
76
|
parallel.rm(bed_chunk_fname)
|
|
78
77
|
baits = GA(pd.concat(bait_chunks))
|
|
79
|
-
baits[
|
|
78
|
+
baits["depth"] /= len(sample_bams)
|
|
80
79
|
return baits
|
|
81
80
|
|
|
82
81
|
|
|
83
82
|
def _scan_depth(args):
|
|
84
83
|
"""Wrapper for parallel map"""
|
|
85
84
|
bed_fname, bam_fnames, min_depth, min_gap, min_length = args
|
|
86
|
-
regions = list(
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
85
|
+
regions = list(
|
|
86
|
+
drop_small(
|
|
87
|
+
merge_gaps(scan_depth(bed_fname, bam_fnames, min_depth), min_gap),
|
|
88
|
+
min_length,
|
|
89
|
+
)
|
|
90
|
+
)
|
|
91
|
+
result = pd.DataFrame.from_records(list(regions), columns=regions[0]._fields)
|
|
92
92
|
return bed_fname, result
|
|
93
93
|
|
|
94
94
|
|
|
@@ -100,32 +100,42 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
|
|
|
100
100
|
tuple
|
|
101
101
|
Region coordinates (0-indexed, half-open): chromosome name, start, end
|
|
102
102
|
"""
|
|
103
|
-
Region = collections.namedtuple(
|
|
103
|
+
Region = collections.namedtuple("Region", "chromosome start end depth")
|
|
104
104
|
|
|
105
105
|
nsamples = len(bam_fnames)
|
|
106
106
|
if nsamples == 1:
|
|
107
|
+
|
|
107
108
|
def get_depth(depths):
|
|
108
109
|
return int(depths[0])
|
|
110
|
+
|
|
109
111
|
else:
|
|
110
112
|
min_depth *= nsamples
|
|
113
|
+
|
|
111
114
|
# NB: samtools emits additional BAMs' depths as trailing columns
|
|
112
115
|
def get_depth(depths):
|
|
113
116
|
return sum(map(int, depths))
|
|
114
117
|
|
|
115
|
-
proc = subprocess.Popen(
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
118
|
+
proc = subprocess.Popen(
|
|
119
|
+
[
|
|
120
|
+
SAMTOOLS,
|
|
121
|
+
"depth",
|
|
122
|
+
"-Q",
|
|
123
|
+
"1", # Skip pseudogenes
|
|
124
|
+
"-b",
|
|
125
|
+
bed_fname,
|
|
126
|
+
]
|
|
127
|
+
+ bam_fnames,
|
|
128
|
+
stdout=subprocess.PIPE,
|
|
129
|
+
encoding="utf-8",
|
|
130
|
+
shell=False,
|
|
131
|
+
)
|
|
122
132
|
|
|
123
133
|
# Detect runs of >= min_depth; emit their coordinates
|
|
124
134
|
chrom = start = depths = None
|
|
125
|
-
for line in proc.stdout:
|
|
126
|
-
fields = line.split(
|
|
135
|
+
for line in proc.stdout: # type: ignore
|
|
136
|
+
fields = line.split("\t")
|
|
127
137
|
depth = get_depth(fields[2:])
|
|
128
|
-
is_enriched =
|
|
138
|
+
is_enriched = depth >= min_depth
|
|
129
139
|
if start is None:
|
|
130
140
|
if is_enriched:
|
|
131
141
|
# Entering a new captured region
|
|
@@ -137,7 +147,7 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
|
|
|
137
147
|
continue
|
|
138
148
|
elif is_enriched and fields[0] == chrom:
|
|
139
149
|
# Still in a captured region -- extend it
|
|
140
|
-
depths.append(depth)
|
|
150
|
+
depths.append(depth) # type: ignore
|
|
141
151
|
else:
|
|
142
152
|
# Exiting a captured region
|
|
143
153
|
# Update target region boundaries
|
|
@@ -146,10 +156,12 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
|
|
|
146
156
|
ok_dp_idx = np.nonzero(darr >= half_depth)[0]
|
|
147
157
|
start_idx = ok_dp_idx[0]
|
|
148
158
|
end_idx = ok_dp_idx[-1] + 1
|
|
149
|
-
yield Region(
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
159
|
+
yield Region(
|
|
160
|
+
chrom,
|
|
161
|
+
start + start_idx,
|
|
162
|
+
start + end_idx,
|
|
163
|
+
darr[start_idx:end_idx].mean(),
|
|
164
|
+
)
|
|
153
165
|
chrom = start = depths = None
|
|
154
166
|
|
|
155
167
|
|
|
@@ -170,75 +182,129 @@ def merge_gaps(regions, min_gap):
|
|
|
170
182
|
|
|
171
183
|
def drop_small(regions, min_length):
|
|
172
184
|
"""Merge small gaps and filter by minimum length."""
|
|
173
|
-
return (reg for reg in regions
|
|
174
|
-
if reg.end - reg.start >= min_length)
|
|
185
|
+
return (reg for reg in regions if reg.end - reg.start >= min_length)
|
|
175
186
|
|
|
176
187
|
|
|
177
188
|
# ___________________________________________
|
|
178
189
|
# Shared
|
|
179
190
|
|
|
191
|
+
|
|
180
192
|
def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1):
|
|
181
193
|
"""Calculate normalized depth, add log2 column, filter by enrich_ratio."""
|
|
182
194
|
# Normalize depths to a neutral value of 1.0
|
|
183
|
-
dp_mode = modal_location(baits.data.loc[baits[
|
|
184
|
-
|
|
185
|
-
norm_depth = baits['depth'] / dp_mode
|
|
195
|
+
dp_mode = modal_location(baits.data.loc[baits["depth"] > min_depth, "depth"].values)
|
|
196
|
+
norm_depth = baits["depth"] / dp_mode
|
|
186
197
|
# Drop low-coverage targets
|
|
187
|
-
keep_idx =
|
|
188
|
-
logging.info(
|
|
189
|
-
|
|
198
|
+
keep_idx = norm_depth >= enrich_ratio
|
|
199
|
+
logging.info(
|
|
200
|
+
"Keeping %d/%d bins with coverage depth >= %f, modal depth %f",
|
|
201
|
+
keep_idx.sum(),
|
|
202
|
+
len(keep_idx),
|
|
203
|
+
dp_mode * enrich_ratio,
|
|
204
|
+
dp_mode,
|
|
205
|
+
)
|
|
190
206
|
return baits[keep_idx]
|
|
191
207
|
|
|
192
208
|
|
|
193
|
-
SAMTOOLS =
|
|
209
|
+
SAMTOOLS = "samtools"
|
|
194
210
|
|
|
195
|
-
if __name__ ==
|
|
211
|
+
if __name__ == "__main__":
|
|
196
212
|
AP = argparse.ArgumentParser(description=__doc__)
|
|
197
|
-
AP.add_argument(
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
213
|
+
AP.add_argument(
|
|
214
|
+
"sample_bams",
|
|
215
|
+
nargs="+",
|
|
216
|
+
help="""Sample BAM file(s) to test for target coverage.""",
|
|
217
|
+
)
|
|
218
|
+
AP.add_argument(
|
|
219
|
+
"-o",
|
|
220
|
+
"--output",
|
|
221
|
+
metavar="FILENAME",
|
|
222
|
+
help="""The inferred targets, in BED format.""",
|
|
223
|
+
)
|
|
224
|
+
AP.add_argument(
|
|
225
|
+
"-c",
|
|
226
|
+
"--coverage",
|
|
227
|
+
metavar="FILENAME",
|
|
228
|
+
help="""Filename to output average coverage depths in .cnn
|
|
229
|
+
format.""",
|
|
230
|
+
)
|
|
231
|
+
AP.add_argument(
|
|
232
|
+
"-p",
|
|
233
|
+
"--processes",
|
|
234
|
+
metavar="CPU",
|
|
235
|
+
nargs="?",
|
|
236
|
+
type=int,
|
|
237
|
+
const=0,
|
|
238
|
+
default=1,
|
|
239
|
+
help="""Number of subprocesses to segment in parallel.
|
|
207
240
|
If given without an argument, use the maximum number
|
|
208
|
-
of available CPUs. [Default: use 1 process]"""
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
241
|
+
of available CPUs. [Default: use 1 process]""",
|
|
242
|
+
)
|
|
243
|
+
AP.add_argument(
|
|
244
|
+
"-f",
|
|
245
|
+
"--fasta",
|
|
246
|
+
metavar="FILENAME",
|
|
247
|
+
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)",
|
|
248
|
+
)
|
|
249
|
+
AP.add_argument(
|
|
250
|
+
"-s",
|
|
251
|
+
"--samtools",
|
|
252
|
+
metavar="SAMTOOLS",
|
|
253
|
+
help="Path to samtools",
|
|
254
|
+
default="samtools",
|
|
255
|
+
)
|
|
213
256
|
|
|
214
257
|
AP_x = AP.add_mutually_exclusive_group(required=True)
|
|
215
|
-
AP_x.add_argument(
|
|
216
|
-
|
|
258
|
+
AP_x.add_argument(
|
|
259
|
+
"-t",
|
|
260
|
+
"--targets",
|
|
261
|
+
metavar="TARGET_BED",
|
|
262
|
+
help="""Potentially targeted genomic regions, e.g. all known
|
|
217
263
|
exons in the reference genome, in BED format. Each of these
|
|
218
264
|
regions will be tested as a whole for enrichment. (Faster
|
|
219
|
-
method)"""
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
265
|
+
method)""",
|
|
266
|
+
)
|
|
267
|
+
AP_x.add_argument(
|
|
268
|
+
"-a",
|
|
269
|
+
"--access",
|
|
270
|
+
metavar="ACCESS_BED",
|
|
271
|
+
# default="../data/access-5k-mappable.grch37.bed",
|
|
272
|
+
help="""Sequencing-accessible genomic regions (e.g. from
|
|
223
273
|
'cnvkit.py access'), or known genic regions in the reference
|
|
224
274
|
genome, in BED format. All bases will be tested for
|
|
225
|
-
enrichment. (Slower method)"""
|
|
275
|
+
enrichment. (Slower method)""",
|
|
276
|
+
)
|
|
226
277
|
|
|
227
278
|
AP_target = AP.add_argument_group("With --targets only")
|
|
228
|
-
AP_target.add_argument(
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
279
|
+
AP_target.add_argument(
|
|
280
|
+
"-d",
|
|
281
|
+
"--min-depth",
|
|
282
|
+
metavar="DEPTH",
|
|
283
|
+
type=int,
|
|
284
|
+
default=5,
|
|
285
|
+
help="""Minimum sequencing read depth to accept as captured.
|
|
286
|
+
[Default: %(default)s]""",
|
|
287
|
+
)
|
|
232
288
|
|
|
233
289
|
AP_access = AP.add_argument_group("With --access only")
|
|
234
|
-
AP_access.add_argument(
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
[Default: %(default)s]"""
|
|
290
|
+
AP_access.add_argument(
|
|
291
|
+
"-g",
|
|
292
|
+
"--min-gap",
|
|
293
|
+
metavar="GAP_SIZE",
|
|
294
|
+
type=int,
|
|
295
|
+
default=25,
|
|
296
|
+
help="""Merge regions separated by gaps smaller than this.
|
|
297
|
+
[Default: %(default)s]""",
|
|
298
|
+
)
|
|
299
|
+
AP_access.add_argument(
|
|
300
|
+
"-l",
|
|
301
|
+
"--min-length",
|
|
302
|
+
metavar="TARGET_SIZE",
|
|
303
|
+
type=int,
|
|
304
|
+
default=50,
|
|
305
|
+
help="""Minimum region length to accept as captured.
|
|
306
|
+
[Default: %(default)s]""",
|
|
307
|
+
)
|
|
242
308
|
|
|
243
309
|
args = AP.parse_args()
|
|
244
310
|
SAMTOOLS = args.samtools
|
|
@@ -247,13 +313,20 @@ if __name__ == '__main__':
|
|
|
247
313
|
args.processes = None
|
|
248
314
|
|
|
249
315
|
if args.targets:
|
|
250
|
-
baits = filter_targets(
|
|
316
|
+
baits = filter_targets(
|
|
317
|
+
args.targets, args.sample_bams, args.processes, args.fasta
|
|
318
|
+
)
|
|
251
319
|
else:
|
|
252
|
-
baits = scan_targets(
|
|
253
|
-
|
|
254
|
-
|
|
320
|
+
baits = scan_targets(
|
|
321
|
+
args.access,
|
|
322
|
+
args.sample_bams,
|
|
323
|
+
0.5 * args.min_depth, # More sensitive 1st pass
|
|
324
|
+
args.min_gap,
|
|
325
|
+
args.min_length,
|
|
326
|
+
args.processes,
|
|
327
|
+
)
|
|
255
328
|
baits = normalize_depth_log2_filter(baits, args.min_depth)
|
|
256
|
-
tabio.write(baits, args.output or sys.stdout,
|
|
329
|
+
tabio.write(baits, args.output or sys.stdout, "bed")
|
|
257
330
|
if args.coverage:
|
|
258
|
-
baits[
|
|
259
|
-
tabio.write(baits, args.coverage,
|
|
331
|
+
baits["log2"] = np.log2(baits["depth"] / baits["depth"].median())
|
|
332
|
+
tabio.write(baits, args.coverage, "tab")
|