biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +428 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +94 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  59. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  60. biopipen/scripts/scrna/RadarPlots.R +73 -53
  61. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  62. biopipen/scripts/scrna/ScVelo.py +0 -0
  63. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  64. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  65. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  66. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  67. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  68. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  69. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  70. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  71. biopipen/scripts/scrna/SlingShot.R +71 -0
  72. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  73. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  74. biopipen/scripts/snp/PlinkFilter.py +7 -7
  75. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  76. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  77. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  78. biopipen/scripts/stats/ChowTest.R +48 -22
  79. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  80. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  81. biopipen/scripts/tcr/ClonalStats.R +484 -0
  82. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  83. biopipen/scripts/tcr/TCRDock.py +10 -6
  84. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  85. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  86. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  87. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  88. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  89. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  90. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  91. biopipen/scripts/vcf/VcfAnno.py +11 -11
  92. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  93. biopipen/scripts/vcf/VcfFilter.py +5 -5
  94. biopipen/scripts/vcf/VcfFix.py +7 -7
  95. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  96. biopipen/scripts/vcf/VcfIndex.py +3 -3
  97. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  98. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  99. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  100. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  101. biopipen/scripts/web/Download.py +8 -4
  102. biopipen/scripts/web/DownloadList.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  104. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  105. biopipen/scripts/web/gcloud_common.py +1 -1
  106. biopipen/utils/gsea.R +75 -35
  107. biopipen/utils/misc.R +205 -7
  108. biopipen/utils/misc.py +17 -8
  109. biopipen/utils/reference.py +11 -11
  110. biopipen/utils/repr.R +146 -0
  111. biopipen/utils/vcf.py +1 -1
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
  114. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
  115. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  116. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  117. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  118. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0

biopipen/scripts/cnvkit/CNVkitHeatmap.py

@@ -4,9 +4,9 @@ from diot import Diot
 
 from biopipen.utils.misc import run_command, dict_to_cli_args
 
-segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa
+segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa # noqa
 sample_sex = {{in.sample_sex | repr}} # pyright: ignore
-outdir = {{out.outdir | repr}} # pyright: ignore
+outdir: str = {{out.outdir | repr}} # pyright: ignore
 cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
 convert = {{envs.convert | quote}} # pyright: ignore
 convert_args = {{envs.convert_args | repr}} # pyright: ignore
@@ -16,7 +16,7 @@ desaturate= {{ envs.desaturate | repr}} # pyright: ignore
 male_reference= {{ envs.male_reference | repr}} # pyright: ignore
 no_shift_xy= {{ envs.no_shift_xy | repr}} # pyright: ignore
 order = {{envs.order | repr}} # pyright: ignore
-cases = {{envs.cases | repr}} # pyright: ignore
+cases: dict | None = {{envs.cases | repr}} # pyright: ignore
 
 
 def parse_order(files, orderfile):
@@ -70,7 +70,7 @@ def do_case(name, case):
     args[""] = [cnvkit, "heatmap"]
     run_command(dict_to_cli_args(args, dashify=True), fg=True)
 
-    conv_args = dict(**conv_args, _=[pdffile, pngfile])
+    conv_args: dict = dict(**conv_args, _=[pdffile, pngfile])
     conv_args[""] = [convert]
     run_command(
         dict_to_cli_args(conv_args, dashify=True, prefix="-"),

biopipen/scripts/cnvkit/CNVkitReference.py

@@ -1,12 +1,12 @@
 from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
 
-covfiles = {{in.covfiles | repr}} # pyright: ignore
+covfiles = {{in.covfiles | repr}} # pyright: ignore # noqa
 target_file = {{in.target_file | repr}} # pyright: ignore
 antitarget_file = {{in.antitarget_file | repr}} # pyright: ignore
 sample_sex = {{in.sample_sex | repr}} # pyright: ignore
 outfile = {{out.outfile | quote}} # pyright: ignore
-reffile = {{envs.ref | repr}} # pyright: ignore
+reffile: str = {{envs.ref | quote}} # pyright: ignore
 cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
 cluster = {{envs.cluster | repr}} # pyright: ignore
 min_cluster_size = {{envs.min_cluster_size | repr}} # pyright: ignore
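
These scripts are rendered templates: the {{in.*}}, {{out.*}}, and {{envs.*}} placeholders are filled in before the script runs, presumably via the filters in biopipen/core/filters.py (also touched in this release). The reffile change above, from | repr to | quote plus a str annotation, points at the difference between the two filters. A hypothetical illustration of what the rendered assignments might look like (all values invented):

    # Possible rendered output of the template lines above -- values are made up.
    reffile: str = "/refs/flat_reference.cnn"    # {{envs.ref | quote}}: always a quoted string
    covfiles = ["S1.targetcoverage.cnn", "S2.targetcoverage.cnn"]  # {{in.covfiles | repr}}: a Python literal
    cluster = False                              # {{envs.cluster | repr}}: bools/None/numbers stay literals

Annotating reffile as str (and cases as dict | None elsewhere) gives pyright a concrete type even though the right-hand side is only known at render time.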

biopipen/scripts/cnvkit/CNVkitScatter.py

@@ -4,14 +4,14 @@ from diot import Diot
 
 from biopipen.utils.misc import run_command, dict_to_cli_args
 
-cnrfile = {{in.cnrfile | quote}} # pyright: ignore
+cnrfile = {{in.cnrfile | quote}} # pyright: ignore # noqa
 cnsfile = {{in.cnsfile | quote}} # pyright: ignore
 convert = {{envs.convert | quote}} # pyright: ignore
 convert_args = {{envs.convert_args | repr}} # pyright: ignore
 vcf = {{in.vcf | repr}} # pyright: ignore
 sample_id = {{in.sample_id | repr}} # pyright: ignore
 normal_id = {{in.normal_id | repr}} # pyright: ignore
-outdir = {{out.outdir | quote}} # pyright: ignore
+outdir: str = {{out.outdir | quote}} # pyright: ignore
 cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
 chromosome = {{envs.chromosome | repr}} # pyright: ignore
 gene = {{envs.gene | repr}} # pyright: ignore
@@ -25,7 +25,7 @@ y_min = {{envs.y_min | repr}} # pyright: ignore
 min_variant_depth = {{envs.min_variant_depth | repr}} # pyright: ignore
 zygosity_freq = {{envs.zygosity_freq | repr}} # pyright: ignore
 title = {{envs.title | repr}} # pyright: ignore
-cases = {{envs.cases | repr}} # pyright: ignore
+cases: dict | None = {{envs.cases | repr}} # pyright: ignore
 
 
 def do_case(name, case):
@@ -50,7 +50,7 @@ def do_case(name, case):
     pdffile = Path(outdir).joinpath(f"{name}.heatmap.pdf")
     pngfile = Path(outdir).joinpath(f"{name}.heatmap.png")
 
-    args = dict(
+    args: dict = dict(
         **case,
         s=cnsfile,
         o=pdffile,
@@ -62,7 +62,7 @@ def do_case(name, case):
     args[""] = [cnvkit, "scatter"]
     run_command(dict_to_cli_args(args, dashify=True), fg=True)
 
-    conv_args = dict(**conv_args, _=[pdffile, pngfile])
+    conv_args: dict = dict(**conv_args, _=[pdffile, pngfile])
     conv_args[""] = [convert]
     run_command(
         dict_to_cli_args(conv_args, dashify=True, prefix="-"),
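
A pattern repeated throughout these scripts: an option dict is expanded into a command line by dict_to_cli_args and executed with run_command. From the hunks above, the empty-string key carries the executable (and subcommand), the "_" key carries positional arguments, and dashify/prefix control how option names are emitted; the exact rules live in biopipen/utils/misc.py, so the expansion sketched in the comments below is an approximation with invented values:

    from biopipen.utils.misc import dict_to_cli_args, run_command

    # Hypothetical inputs mirroring the heatmap/scatter hunks above.
    conv_args: dict = {"density": 150, "_": ["case1.heatmap.pdf", "case1.heatmap.png"]}
    conv_args[""] = ["convert"]  # the executable goes under the empty-string key

    cmd = dict_to_cli_args(conv_args, dashify=True, prefix="-")
    # Roughly: ["convert", "-density", "150", "case1.heatmap.pdf", "case1.heatmap.png"]
    # run_command(cmd, fg=True)  # would hand the rendered PDF to ImageMagick's convert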

biopipen/scripts/cnvkit/CNVkitSegment.py

@@ -2,11 +2,11 @@ from pathlib import Path
 
 from biopipen.utils.misc import run_command, dict_to_cli_args
 
-cnrfile = {{in.cnrfile | quote}} # pyright: ignore
+cnrfile = {{in.cnrfile | quote}} # pyright: ignore # noqa
 vcf = {{in.vcf | repr}} # pyright: ignore
 sample_id = {{in.sample_id | repr}} # pyright: ignore
 normal_id = {{in.normal_id | repr}} # pyright: ignore
-outfile = {{out.outfile | quote}} # pyright: ignore
+outfile: str = {{out.outfile | quote}} # pyright: ignore
 cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
 method = {{envs.method | quote}} # pyright: ignore
 threshold = {{envs.threshold | repr}} # pyright: ignore
@@ -21,7 +21,7 @@ zygosity_freq = {{envs.zygosity_freq | repr}} # pyright: ignore
 
 def main():
 
-    args = dict(
+    args: dict = dict(
        o=outfile,
        d=Path(outfile).parent / "intermediate.rds",
        m=method,
@@ -39,8 +39,8 @@ def main():
        _=cnrfile,
    )
    args[""] = [cnvkit, "segment"]
-   args = dict_to_cli_args(args, dashify=True)
-   run_command(args, fg=True)
+   cmd_args = dict_to_cli_args(args, dashify=True)
+   run_command(cmd_args, fg=True)
 
 
 if __name__ == "__main__":
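
Context for the args -> cmd_args rename in the last hunk: together with the args: dict annotations added elsewhere, reusing the name args for the return value of dict_to_cli_args would rebind a dict-annotated variable to a command list, which pyright flags. A minimal, hypothetical illustration (values invented; dict_to_cli_args is the real helper from biopipen.utils.misc):

    from biopipen.utils.misc import dict_to_cli_args

    args: dict = {"": ["cnvkit", "segment"], "o": "out.cns", "m": "cbs"}  # hypothetical
    # args = dict_to_cli_args(args, dashify=True)   # rebinds args to a non-dict -> pyright error
    cmd_args = dict_to_cli_args(args, dashify=True)  # separate name keeps both annotations valid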

biopipen/scripts/cnvkit/guess_baits.py

@@ -25,10 +25,10 @@ import sys
 import numpy as np
 import pandas as pd
 
-import cnvlib
-from cnvlib import parallel
-from cnvlib.descriptives import modal_location
-from skgenome import tabio, GenomicArray as GA
+import cnvlib  # type: ignore
+from cnvlib import parallel  # type: ignore
+from cnvlib.descriptives import modal_location  # type: ignore
+from skgenome import tabio, GenomicArray as GA  # type: ignore
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
@@ -36,11 +36,12 @@ logging.basicConfig(level=logging.INFO, format="%(message)s")
 # ___________________________________________
 # Guided method: guess from potential targets
 
+
 def filter_targets(target_bed, sample_bams, procs, fasta):
     """Check if each potential target has significant coverage."""
     try:
-        baits = tabio.read(target_bed, 'bed4')
-    except:
+        baits = tabio.read(target_bed, "bed4")
+    except:  # noqa
         raise RuntimeError("Targets must be in BED format; try skg_convert.py")
     logging.info("Loaded %d candidate regions from %s", len(baits), target_bed)
     # Loop over BAMs to calculate weighted averages of bin coverage depths
@@ -48,47 +49,46 @@ def filter_targets(target_bed, sample_bams, procs, fasta):
     for bam_fname in sample_bams:
         logging.info("Evaluating targets in %s", bam_fname)
         sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta)
-        assert len(sample) == len(baits), \
-            "%d != %d" % (len(sample), len(baits))
-        total_depths += sample['depth'].values
-    baits['depth'] = total_depths / len(sample_bams)
-    logging.info("Average candidate-target depth:\n%s",
-                 baits['depth'].describe())
+        assert len(sample) == len(baits), "%d != %d" % (len(sample), len(baits))
+        total_depths += sample["depth"].values
+    baits["depth"] = total_depths / len(sample_bams)
+    logging.info("Average candidate-target depth:\n%s", baits["depth"].describe())
     return baits
 
 
 # _________________________________________
 # Unguided method: guess from raw depths
 
-def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length,
-                 procs):
+
+def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length, procs):
     """Estimate baited regions from a genome-wide, per-base depth profile."""
     bait_chunks = []
     # ENH: context manager to call rm on bed chunks? with to_chunks as pool, ck?
-    logging.info("Scanning for enriched regions in:\n %s",
-                 '\n '.join(sample_bams))
+    logging.info("Scanning for enriched regions in:\n %s", "\n ".join(sample_bams))
     # with futures.ProcessPoolExecutor(procs) as pool:
     with parallel.pick_pool(procs) as pool:
-        args_iter = ((bed_chunk, sample_bams,
-                      min_depth, min_gap, min_length)
-                     for bed_chunk in parallel.to_chunks(access_bed))
+        args_iter = (
+            (bed_chunk, sample_bams, min_depth, min_gap, min_length)
+            for bed_chunk in parallel.to_chunks(access_bed)
+        )
         for bed_chunk_fname, bait_chunk in pool.map(_scan_depth, args_iter):
             bait_chunks.append(bait_chunk)
             parallel.rm(bed_chunk_fname)
     baits = GA(pd.concat(bait_chunks))
-    baits['depth'] /= len(sample_bams)
+    baits["depth"] /= len(sample_bams)
     return baits
 
 
 def _scan_depth(args):
     """Wrapper for parallel map"""
     bed_fname, bam_fnames, min_depth, min_gap, min_length = args
-    regions = list(drop_small(merge_gaps(scan_depth(bed_fname, bam_fnames,
-                                                    min_depth),
-                                         min_gap),
-                              min_length))
-    result = pd.DataFrame.from_records(list(regions),
-                                       columns=regions[0]._fields)
+    regions = list(
+        drop_small(
+            merge_gaps(scan_depth(bed_fname, bam_fnames, min_depth), min_gap),
+            min_length,
+        )
+    )
+    result = pd.DataFrame.from_records(list(regions), columns=regions[0]._fields)
     return bed_fname, result
 
 
@@ -100,32 +100,42 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
     tuple
         Region coordinates (0-indexed, half-open): chromosome name, start, end
     """
-    Region = collections.namedtuple('Region', 'chromosome start end depth')
+    Region = collections.namedtuple("Region", "chromosome start end depth")
 
     nsamples = len(bam_fnames)
     if nsamples == 1:
+
         def get_depth(depths):
             return int(depths[0])
+
     else:
         min_depth *= nsamples
+
         # NB: samtools emits additional BAMs' depths as trailing columns
         def get_depth(depths):
             return sum(map(int, depths))
 
-    proc = subprocess.Popen([SAMTOOLS, 'depth',
-                             '-Q', '1',  # Skip pseudogenes
-                             '-b', bed_fname,
-                             ] + bam_fnames,
-                            stdout=subprocess.PIPE,
-                            encoding='utf-8',
-                            shell=False)
+    proc = subprocess.Popen(
+        [
+            SAMTOOLS,
+            "depth",
+            "-Q",
+            "1",  # Skip pseudogenes
+            "-b",
+            bed_fname,
+        ]
+        + bam_fnames,
+        stdout=subprocess.PIPE,
+        encoding="utf-8",
+        shell=False,
+    )
 
     # Detect runs of >= min_depth; emit their coordinates
     chrom = start = depths = None
-    for line in proc.stdout:
-        fields = line.split('\t')
+    for line in proc.stdout:  # type: ignore
+        fields = line.split("\t")
         depth = get_depth(fields[2:])
-        is_enriched = (depth >= min_depth)
+        is_enriched = depth >= min_depth
         if start is None:
             if is_enriched:
                 # Entering a new captured region
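
In the unguided mode, scan_depth shells out to samtools depth over one BED chunk at a time and pools the per-base depth columns across the input BAMs. The reformatted Popen call above is equivalent to running roughly the following (file names hypothetical; -Q 1 requires mapping quality of at least 1, -b restricts counting to the chunk's regions):

    import subprocess

    cmd = ["samtools", "depth", "-Q", "1", "-b", "chunk_0001.bed", "sample1.bam", "sample2.bam"]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding="utf-8")
    for line in proc.stdout:                 # columns: chrom, 1-based pos, one depth per BAM
        chrom, pos, *depths = line.split("\t")
        total = sum(map(int, depths))        # pooled depth, as get_depth does for multiple BAMs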

@@ -137,7 +147,7 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
                 continue
         elif is_enriched and fields[0] == chrom:
             # Still in a captured region -- extend it
-            depths.append(depth)
+            depths.append(depth)  # type: ignore
         else:
             # Exiting a captured region
             # Update target region boundaries
@@ -146,10 +156,12 @@ def scan_depth(bed_fname, bam_fnames, min_depth):
             ok_dp_idx = np.nonzero(darr >= half_depth)[0]
             start_idx = ok_dp_idx[0]
             end_idx = ok_dp_idx[-1] + 1
-            yield Region(chrom,
-                         start + start_idx,
-                         start + end_idx,
-                         darr[start_idx:end_idx].mean())
+            yield Region(
+                chrom,
+                start + start_idx,
+                start + end_idx,
+                darr[start_idx:end_idx].mean(),
+            )
             chrom = start = depths = None
 
 

@@ -170,75 +182,129 @@ def merge_gaps(regions, min_gap):
 
 def drop_small(regions, min_length):
     """Merge small gaps and filter by minimum length."""
-    return (reg for reg in regions
-            if reg.end - reg.start >= min_length)
+    return (reg for reg in regions if reg.end - reg.start >= min_length)
 
 
 # ___________________________________________
 # Shared
 
+
 def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1):
     """Calculate normalized depth, add log2 column, filter by enrich_ratio."""
     # Normalize depths to a neutral value of 1.0
-    dp_mode = modal_location(baits.data.loc[baits['depth'] > min_depth,
-                                            'depth'].values)
-    norm_depth = baits['depth'] / dp_mode
+    dp_mode = modal_location(baits.data.loc[baits["depth"] > min_depth, "depth"].values)
+    norm_depth = baits["depth"] / dp_mode
     # Drop low-coverage targets
-    keep_idx = (norm_depth >= enrich_ratio)
-    logging.info("Keeping %d/%d bins with coverage depth >= %f, modal depth %f",
-                 keep_idx.sum(), len(keep_idx), dp_mode * enrich_ratio, dp_mode)
+    keep_idx = norm_depth >= enrich_ratio
+    logging.info(
+        "Keeping %d/%d bins with coverage depth >= %f, modal depth %f",
+        keep_idx.sum(),
+        len(keep_idx),
+        dp_mode * enrich_ratio,
+        dp_mode,
+    )
     return baits[keep_idx]
 
 
-SAMTOOLS = 'samtools'
+SAMTOOLS = "samtools"
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     AP = argparse.ArgumentParser(description=__doc__)
-    AP.add_argument('sample_bams', nargs='+',
-                    help="""Sample BAM file(s) to test for target coverage.""")
-    AP.add_argument('-o', '--output', metavar='FILENAME',
-                    help="""The inferred targets, in BED format.""")
-    AP.add_argument('-c', '--coverage', metavar='FILENAME',
-                    help="""Filename to output average coverage depths in .cnn
-                    format.""")
-    AP.add_argument('-p', '--processes', metavar='CPU',
-                    nargs='?', type=int, const=0, default=1,
-                    help="""Number of subprocesses to segment in parallel.
+    AP.add_argument(
+        "sample_bams",
+        nargs="+",
+        help="""Sample BAM file(s) to test for target coverage.""",
+    )
+    AP.add_argument(
+        "-o",
+        "--output",
+        metavar="FILENAME",
+        help="""The inferred targets, in BED format.""",
+    )
+    AP.add_argument(
+        "-c",
+        "--coverage",
+        metavar="FILENAME",
+        help="""Filename to output average coverage depths in .cnn
+        format.""",
+    )
+    AP.add_argument(
+        "-p",
+        "--processes",
+        metavar="CPU",
+        nargs="?",
+        type=int,
+        const=0,
+        default=1,
+        help="""Number of subprocesses to segment in parallel.
         If given without an argument, use the maximum number
-        of available CPUs. [Default: use 1 process]""")
-    AP.add_argument('-f', '--fasta', metavar="FILENAME",
-                    help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
-    AP.add_argument('-s', '--samtools', metavar="SAMTOOLS",
-                    help="Path to samtools", default="samtools")
+        of available CPUs. [Default: use 1 process]""",
+    )
+    AP.add_argument(
+        "-f",
+        "--fasta",
+        metavar="FILENAME",
+        help="Reference genome, FASTA format (e.g. UCSC hg19.fa)",
+    )
+    AP.add_argument(
+        "-s",
+        "--samtools",
+        metavar="SAMTOOLS",
+        help="Path to samtools",
+        default="samtools",
+    )
 
     AP_x = AP.add_mutually_exclusive_group(required=True)
-    AP_x.add_argument('-t', '--targets', metavar='TARGET_BED',
-                      help="""Potentially targeted genomic regions, e.g. all known
+    AP_x.add_argument(
+        "-t",
+        "--targets",
+        metavar="TARGET_BED",
+        help="""Potentially targeted genomic regions, e.g. all known
         exons in the reference genome, in BED format. Each of these
         regions will be tested as a whole for enrichment. (Faster
-        method)""")
-    AP_x.add_argument('-a', '--access', metavar='ACCESS_BED',
-                      # default="../data/access-5k-mappable.grch37.bed",
-                      help="""Sequencing-accessible genomic regions (e.g. from
+        method)""",
+    )
+    AP_x.add_argument(
+        "-a",
+        "--access",
+        metavar="ACCESS_BED",
+        # default="../data/access-5k-mappable.grch37.bed",
+        help="""Sequencing-accessible genomic regions (e.g. from
         'cnvkit.py access'), or known genic regions in the reference
         genome, in BED format. All bases will be tested for
-        enrichment. (Slower method)""")
+        enrichment. (Slower method)""",
+    )
 
     AP_target = AP.add_argument_group("With --targets only")
-    AP_target.add_argument('-d', '--min-depth', metavar='DEPTH',
-                           type=int, default=5,
-                           help="""Minimum sequencing read depth to accept as captured.
-                           [Default: %(default)s]""")
+    AP_target.add_argument(
+        "-d",
+        "--min-depth",
+        metavar="DEPTH",
+        type=int,
+        default=5,
+        help="""Minimum sequencing read depth to accept as captured.
+        [Default: %(default)s]""",
+    )
 
     AP_access = AP.add_argument_group("With --access only")
-    AP_access.add_argument('-g', '--min-gap', metavar='GAP_SIZE',
-                           type=int, default=25,
-                           help="""Merge regions separated by gaps smaller than this.
-                           [Default: %(default)s]""")
-    AP_access.add_argument('-l', '--min-length', metavar='TARGET_SIZE',
-                           type=int, default=50,
-                           help="""Minimum region length to accept as captured.
-                           [Default: %(default)s]""")
+    AP_access.add_argument(
+        "-g",
+        "--min-gap",
+        metavar="GAP_SIZE",
+        type=int,
+        default=25,
+        help="""Merge regions separated by gaps smaller than this.
+        [Default: %(default)s]""",
+    )
+    AP_access.add_argument(
+        "-l",
+        "--min-length",
+        metavar="TARGET_SIZE",
+        type=int,
+        default=50,
+        help="""Minimum region length to accept as captured.
+        [Default: %(default)s]""",
+    )
 
     args = AP.parse_args()
     SAMTOOLS = args.samtools
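
The reformatted normalize_depth_log2_filter above normalizes every bin's depth against the modal depth of adequately covered bins and keeps only bins whose normalized depth reaches enrich_ratio (0.1 by default). A rough standalone illustration with invented numbers, using the median as a stand-in for cnvlib's modal_location:

    import numpy as np

    depths = np.array([0.5, 2.0, 38.0, 41.0, 40.0, 120.0])  # hypothetical per-bin depths
    min_depth, enrich_ratio = 5, 0.1
    dp_mode = np.median(depths[depths > min_depth])    # stand-in for modal_location; 40.5 here
    keep = (depths / dp_mode) >= enrich_ratio           # keep bins with >= 10% of the modal depth
    print(dp_mode, depths[keep])                         # 40.5 [ 38.  41.  40. 120.]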

@@ -247,13 +313,20 @@ if __name__ == '__main__':
         args.processes = None
 
     if args.targets:
-        baits = filter_targets(args.targets, args.sample_bams, args.processes, args.fasta)
+        baits = filter_targets(
+            args.targets, args.sample_bams, args.processes, args.fasta
+        )
     else:
-        baits = scan_targets(args.access, args.sample_bams,
-                             0.5 * args.min_depth,  # More sensitive 1st pass
-                             args.min_gap, args.min_length, args.processes)
+        baits = scan_targets(
+            args.access,
+            args.sample_bams,
+            0.5 * args.min_depth,  # More sensitive 1st pass
+            args.min_gap,
+            args.min_length,
+            args.processes,
+        )
     baits = normalize_depth_log2_filter(baits, args.min_depth)
-    tabio.write(baits, args.output or sys.stdout, 'bed')
+    tabio.write(baits, args.output or sys.stdout, "bed")
     if args.coverage:
-        baits['log2'] = np.log2(baits['depth'] / baits['depth'].median())
-        tabio.write(baits, args.coverage, 'tab')
+        baits["log2"] = np.log2(baits["depth"] / baits["depth"].median())
+        tabio.write(baits, args.coverage, "tab")
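
Taken together, the reformatted argparse section preserves the script's two modes: -t/--targets tests whole candidate regions (guided, faster), while -a/--access scans accessible regions base by base (unguided, slower), with -d grouped under --targets and -g/-l under --access. Hypothetical invocations (paths and sample names invented):

    # Guided: test known exons for enrichment, writing targets and per-bin coverage
    python guess_baits.py -t known_exons.bed -p 8 -o baits.bed -c baits.cnn sample1.bam sample2.bam

    # Unguided: scan accessible regions, merging gaps < 25 bp and dropping regions < 50 bp
    python guess_baits.py -a access-5k-mappable.bed -g 25 -l 50 -o baits.bed sample1.bam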