biodatatools 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biodatatools-0.0.1/PKG-INFO +19 -0
- biodatatools-0.0.1/README.md +4 -0
- biodatatools-0.0.1/biodatatools/__init__.py +234 -0
- biodatatools-0.0.1/biodatatools.egg-info/PKG-INFO +19 -0
- biodatatools-0.0.1/biodatatools.egg-info/SOURCES.txt +9 -0
- biodatatools-0.0.1/biodatatools.egg-info/dependency_links.txt +1 -0
- biodatatools-0.0.1/biodatatools.egg-info/entry_points.txt +2 -0
- biodatatools-0.0.1/biodatatools.egg-info/requires.txt +7 -0
- biodatatools-0.0.1/biodatatools.egg-info/top_level.txt +1 -0
- biodatatools-0.0.1/setup.cfg +4 -0
- biodatatools-0.0.1/setup.py +32 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: biodatatools
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A python package with useful biological data processing methods
|
|
5
|
+
Home-page: https://github.com/aldenleung/biodatatools/
|
|
6
|
+
Author: Alden Leung
|
|
7
|
+
Author-email: alden.leung@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# biodatatools
|
|
17
|
+
|
|
18
|
+
A python package that provides multiple useful functions for biological data processing.
|
|
19
|
+
Many functions in this package require external programs to be installed.
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import simplevc
|
|
3
|
+
simplevc.register(sys.modules[__name__], "0.0.1")
|
|
4
|
+
|
|
5
|
+
import tempfile
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
import shutil
|
|
9
|
+
|
|
10
|
+
import pysam
|
|
11
|
+
import pyBigWig
|
|
12
|
+
|
|
13
|
+
from biodata.baseio import BaseWriter
|
|
14
|
+
from biodata.bed import BEDPE
|
|
15
|
+
from biodata.delimited import DelimitedReader, DelimitedWriter
|
|
16
|
+
|
|
17
|
+
from commonhelper import convert_to_bool, safe_inverse_zip, nested_default_dict
|
|
18
|
+
from mphelper import ProcessWrapPool
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def check_binaries_validity(*binary_names): # Change to decorator in the future
    '''
    Ensure that every required external program can be located with shutil.which.

    :param binary_names: Names (or paths) of the executables that must be available
    :raises Exception: If at least one executable cannot be found. The message lists only the missing ones
    '''
    missing_binary_names = [binary_name for binary_name in binary_names if shutil.which(binary_name) is None]
    if missing_binary_names:
        # Report only what is actually missing, not every binary that was requested
        raise Exception("The following binaries are not found: " + ",".join(missing_binary_names))
|
|
27
|
+
|
|
28
|
+
def bash_command(cmd):
    '''
    Execute a command line through /bin/bash.

    :param cmd: Command string handed to the shell as-is
    :raises Exception: If the command exits with a non-zero return code
    '''
    completed = subprocess.run(cmd, shell=True, executable='/bin/bash')
    if completed.returncode == 0:
        return
    raise Exception("Bash command fails: " + cmd)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@vt(
    description="A wrapper to bedGraphToBigWig", helps=dict(
        i="Input bedgraph file", g="chrom size file", o="output bigwig file",
        autosort="Perform sorting on bedgraph file before running bedGraphToBigWig",
        filter_chr="Remove chromosomes in bedgraph file that are not present in chrom.sizes file",
        nthread="Number of threads used in sorting")
)
@vc
def _convert_bedgraph_to_bigwig_20240423(i:str, g:str, o:str, autosort:convert_to_bool=False, filter_chr:convert_to_bool=False, nthread:int=1):
    '''
    Convert bedgraph into bigwig files. Auto sort and filter bedgraphs prior to calling bedGraphToBigWig

    :param i: Input bedgraph file
    :param g: chrom.size file
    :param o: Output bw file
    :param autosort: Perform sorting on bedgraph file before running bedGraphToBigWig
    :param filter_chr: Remove chromosomes in bedgraph file that are not present in chrom.sizes file
    :param nthread: Number of threads used in sorting (passed to sort as --parallel when > 1)
    :raises Exception: If a required binary is missing, or if sorting or bedGraphToBigWig exits with a non-zero code
    '''
    check_binaries_validity("zcat", "sort", "bedGraphToBigWig")
    # NOTE(review): i, g and o are interpolated into shell command lines unquoted,
    # so paths containing spaces or shell metacharacters will not work.
    tmpfiles = []
    try:
        if filter_chr:
            # Keep only the records whose chromosome is listed in the chrom size file
            inputfile = tempfile.NamedTemporaryFile(mode='w+', suffix=".bg", delete=False).name
            tmpfiles.append(inputfile)
            with DelimitedReader(g) as dr:
                chromosomes = {d[0] for d in dr}
            with DelimitedReader(i) as dr, DelimitedWriter(inputfile) as dw:
                for d in dr:
                    if d[0] in chromosomes:
                        dw.write(d)
            i = inputfile

        if autosort:
            inputfile = tempfile.NamedTemporaryFile(mode='w+', suffix=".bg", delete=False).name
            tmpfiles.append(inputfile)
            if nthread > 1:
                added_param = f"--parallel={nthread} "
            else:
                added_param = ""
            # Sort by chromosome name, then numerically by start position
            if i.endswith(".gz"):
                result0 = subprocess.run(f"zcat {i} | sort -k1,1 -k2,2n {added_param}> {inputfile}", shell=True)
            else:
                result0 = subprocess.run(f"sort -k1,1 -k2,2n {added_param}{i} > {inputfile}", shell=True)
            if result0.returncode != 0:
                raise Exception("Exception in sorting bg")
            i = inputfile

        result = subprocess.run(f"bedGraphToBigWig {i} {g} {o}", shell=True, executable="/bin/bash")
        if result.returncode != 0:
            raise Exception("Exception in running bedGraphToBigWig")
    finally:
        # Remove the intermediate files even when one of the steps above fails
        for tmpfile in tmpfiles:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
|
|
84
|
+
|
|
85
|
+
@vt(description="Convert GROcap/PROcap/GROseq/PROseq bam file to bigwig files (paired-end reads). Returns 4 bigwig files representing 5' and 3' end of the molecules on plus or minus strand",
    helps=dict(i="Input bam file", g="chrom size file", o="output bigwig file prefix",
        paired_end="True: paired-end sequencing; False: single-end sequencing",
        rna_strand="Indicate whether RNA strand is forward or reverse. In paired-end, forward represents that first read is 5'."
    )
)
@vc
def _process_PROcap_bam_to_bigwig_20240423(i:str, g:str, o:str, paired_end : convert_to_bool, rna_strand : str):
    '''
    Create four bigwig files (o + "_5pl.bw", "_5mn.bw", "_3pl.bw", "_3mn.bw") from a bam file.
    Values in the two "mn" tracks are written with a leading minus sign.

    first_read_is_5 --> strand == Forward

    :param i: Input bam file
    :param g: chrom size file
    :param o: Output bigwig file prefix
    :param paired_end: True: paired-end sequencing; False: single-end sequencing
    :param rna_strand: "forward" or "reverse". Only consulted when paired_end is true
    :raises Exception: If a required binary is missing, or if paired_end is true and rna_strand is neither "forward" nor "reverse"
    '''
    check_binaries_validity("samtools", "bedtools", "zcat", "sort", "bedGraphToBigWig")
    # Validate before any temporary file or worker pool is created
    if paired_end and rna_strand not in ("forward", "reverse"):
        raise Exception(f"Unsupported rna_strand {rna_strand!r}: expected 'forward' or 'reverse'")

    # awk program that prefixes the 4th bedgraph column with a minus sign
    negate = "awk {'printf (\"%s\\t%s\\t%s\\t-%s\\n\", $1, $2, $3, $4)'}"
    thread = 16
    tmpfiles = []
    tmpfiles_bam = []
    pwpool = ProcessWrapPool(4)
    try:
        tmpfiles.extend(tempfile.NamedTemporaryFile(mode='w+', suffix=".bg", delete=False).name for _ in range(4))
        bg5_pl, bg5_mn, bg3_pl, bg3_mn = tmpfiles
        if paired_end:
            tmpfiles_bam.extend(tempfile.NamedTemporaryFile(mode='w+', suffix=".bam", delete=False).name for _ in range(2))
            bam5, bam3 = tmpfiles_bam
            # SAM flags: 66 = properly paired + first in pair, 130 = properly paired + second in pair
            flag5, flag3 = (66, 130) if rna_strand == "forward" else (130, 66)
            bam5_pid = pwpool.run(bash_command, args=[f"samtools view -f {flag5} --write-index -@ {thread} -o {bam5} {i}"])
            bam3_pid = pwpool.run(bash_command, args=[f"samtools view -f {flag3} --write-index -@ {thread} -o {bam3} {i}"])
            # Be careful of the strand. We assumed F1R2 setup
            bgpl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam5} -5 -strand + -bg > {bg5_pl}"], dependencies=[bam5_pid])
            bgmn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam5} -5 -strand - -bg | {negate} > {bg5_mn}"], dependencies=[bam5_pid])
            # The 3' end is taken from the 5' end of the mate, which is on the opposite strand
            bg3pl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam3} -5 -strand - -bg > {bg3_pl}"], dependencies=[bam3_pid])
            bg3mn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam3} -5 -strand + -bg | {negate} > {bg3_mn}"], dependencies=[bam3_pid])
        else:
            # No bam files needed
            bgpl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {i} -5 -strand + -bg > {bg5_pl}"])
            bgmn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {i} -5 -strand - -bg | {negate} > {bg5_mn}"])
            bg3pl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {i} -3 -strand + -bg > {bg3_pl}"])
            bg3mn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {i} -3 -strand - -bg | {negate} > {bg3_mn}"])

        conversions = [
            (bg5_pl, "_5pl.bw", bgpl_pid),
            (bg5_mn, "_5mn.bw", bgmn_pid),
            (bg3_pl, "_3pl.bw", bg3pl_pid),
            (bg3_mn, "_3mn.bw", bg3mn_pid),
        ]
        for bedgraph, suffix, bedgraph_pid in conversions:
            pwpool.run(_convert_bedgraph_to_bigwig_20240423, args=[bedgraph, g, o + suffix], kwargs=dict(autosort=True, filter_chr=True), dependencies=[bedgraph_pid])
        pwpool.get(wait=True)
    finally:
        try:
            # NOTE(review): assumes ProcessWrapPool.close() is safe to call after a failed run - confirm
            pwpool.close()
        finally:
            # samtools --write-index also leaves an index file next to each temporary bam
            index_files = [bam + ext for bam in tmpfiles_bam for ext in (".csi", ".bai")]
            for tmpfile in tmpfiles + tmpfiles_bam + index_files:
                if os.path.exists(tmpfile):
                    os.unlink(tmpfile)
|
|
133
|
+
|
|
134
|
+
@vt(description="Convert GROcap/PROcap/GROseq/PROseq bam file to bed files Returns 2 bed files with the 4th column as a comma separated list of RNA distances from TSS",
    helps=dict(i="Input bam file", o="output bed file prefix. Two files, _dpl.bed.gz and _dmn.bed.gz are output",
        paired_end="True: paired-end sequencing; False: single-end sequencing",
        rna_strand="Indicate whether RNA strand is forward or reverse. In paired-end, forward represents that first read is 5'.",
        min_rna_len="Minimum RNA length to record",
        max_rna_len="Maximum RNA length to record"
    )
)
@vc
def _process_PROcap_bam_to_TSS_RNA_len_20240423(i:str, o:str, paired_end:convert_to_bool, rna_strand:str, min_rna_len:int=0, max_rna_len:int=100000, target_chromosomes:str=None):
    '''
    Pair up the two reads of each query name and record, per strand / chromosome / position,
    the sorted list of distances spanned by the qualifying read pairs.
    Writes f"{o}_dpl.bed.gz" (plus strand) and f"{o}_dmn.bed.gz" (minus strand).

    :param i: Input bam file
    :param o: Output bed file prefix
    :param paired_end: Must be true; single-end input is not supported yet
    :param rna_strand: "forward" or "reverse"
    :param min_rna_len: Minimum RNA length to record
    :param max_rna_len: Maximum RNA length to record
    :param target_chromosomes: Currently unused (the chromosome filter is not implemented)
    :raises Exception: If a required binary is missing, paired_end is false, or rna_strand is neither "forward" nor "reverse"
    '''
    # NOTE(review): presumably bgzip is needed by BaseWriter for the .gz outputs - confirm
    check_binaries_validity("bgzip")
    def _to_position(alignment):
        # 1-based coordinate of the read's 5' end: pysam reference_start is 0-based,
        # reference_end points one past the last aligned base
        position = alignment.reference_end if alignment.is_reverse else (alignment.reference_start + 1)
        strand = "-" if alignment.is_reverse else "+"
        return (alignment.reference_name, position, strand)

    if not paired_end:
        raise Exception("Single-end not supported yet.")
    # Fail fast instead of discovering a bad value in the middle of the bam file
    if rna_strand not in ("forward", "reverse"):
        raise Exception(f"Unsupported rna_strand {rna_strand!r}: expected 'forward' or 'reverse'")
    saved_reads = {}
    TSS_counter = nested_default_dict(3, list)
    with pysam.AlignmentFile(i) as samfh:
        for alignment in samfh:
            if alignment.query_name not in saved_reads:
                # First record seen for this query name: hold it until its mate shows up
                saved_reads[alignment.query_name] = alignment
                continue
            prev_alignment = saved_reads.pop(alignment.query_name)
            alignment1 = prev_alignment if prev_alignment.is_read1 else alignment
            alignment2 = prev_alignment if prev_alignment.is_read2 else alignment
            p1 = _to_position(alignment1) # read1: Pol
            p2 = _to_position(alignment2) # read2: TSS

            b = BEDPE(p1[0], p1[1] - 1, p1[1], p2[0], p2[1] - 1, p2[1], strand1 = p1[2], strand2 = p2[2])
            # Keep only pairs on the same chromosome, on opposite strands, facing each other,
            # whose span lies within [min_rna_len, max_rna_len]
            if (b.chrom1 == b.chrom2
                and ( (b.strand1 == "+" and b.strand2 == "-" and b.start1 <= b.start2 and b.stop1 <= b.stop2 and min_rna_len <= b.stop2 - b.start1 <= max_rna_len)
                    or (b.strand1 == "-" and b.strand2 == "+" and b.start2 <= b.start1 and b.stop2 <= b.stop1 and min_rna_len <= b.stop1 - b.start2 <= max_rna_len))):

                d = b.stop2 - b.start1 if b.strand1 == "+" else b.stop1 - b.start2
                if rna_strand == "forward":
                    strand = b.strand1
                    if strand == "+":
                        TSS_counter[strand][b.chrom1][b.genomic_pos1.start].append(d)
                    else:
                        TSS_counter[strand][b.chrom1][b.genomic_pos1.stop].append(d)
                else: # rna_strand == "reverse", validated above
                    strand = b.strand2
                    if strand == "+":
                        TSS_counter[strand][b.chrom1][b.genomic_pos2.stop].append(d)
                    else:
                        TSS_counter[strand][b.chrom1][b.genomic_pos2.start].append(d)

    for output_file, strand in zip([f"{o}_dpl.bed.gz", f"{o}_dmn.bed.gz"], ["+", "-"]):
        with BaseWriter(output_file) as bwd:
            regions = TSS_counter[strand]
            for r in sorted(regions.keys()):
                positions = regions[r]
                for p in sorted(positions.keys()):
                    v = sorted(positions[p])
                    bwd.write(f"{r}\t{p - 1}\t{p}\t{','.join(list(map(str, v)))}\n")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@vt(description="Modify bigwig files according to the func",
    helps=dict(i="Input bigwig file", o="Output bigwig file", func="Function to modify bigwig. Either a python function or a string to be evaluated as python lambda function. For example, to convert all positive values into negative values, 'lambda x: x * -1'")
)

@vc
def _modify_bigwig_values_20240423(i:str, o:str, func:str):
    '''
    Apply func to the value of every interval in a bigwig file and write the result to a new bigwig file.
    Chromosome sizes and interval coordinates are copied unchanged.

    :param i: Input bigwig file
    :param o: Output bigwig file
    :param func: A callable taking one value and returning the new value, or a string that evaluates to such a callable
    '''
    if isinstance(func, str):
        # WARNING: eval executes arbitrary code. The empty globals dict hides this module's
        # variables but does not remove builtins, so never pass an untrusted string here.
        func = eval(func, {}) # While unsafe to use eval, disable access to global variables to make it a little bit safer..
    input_bw = pyBigWig.open(i)
    try:
        chrom_sizes = input_bw.chroms()
        header = list(chrom_sizes.items())
        all_intervals = []
        for chrom in chrom_sizes:
            # Query each chromosome once; intervals() gives None when the chromosome has no entry
            intervals = input_bw.intervals(chrom)
            if intervals is not None:
                all_intervals.extend((chrom, *interval) for interval in intervals)
    finally:
        input_bw.close()
    chroms, starts, ends, values = safe_inverse_zip(all_intervals, 4)
    values = list(map(func, values))
    # Open the output only after every value was converted, so that a failing func
    # does not leave a truncated output file behind
    output_bw = pyBigWig.open(o, "w")
    output_bw.addHeader(header)
    output_bw.addEntries(list(chroms), list(starts), ends=list(ends), values=list(values))
    output_bw.close()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# Script entry point. NOTE(review): `main` is not defined or imported in this file;
# presumably it is injected into this module by simplevc.register - confirm.
if __name__ == "__main__":
    main()
|
|
233
|
+
|
|
234
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: biodatatools
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A python package with useful biological data processing methods
|
|
5
|
+
Home-page: https://github.com/aldenleung/biodatatools/
|
|
6
|
+
Author: Alden Leung
|
|
7
|
+
Author-email: alden.leung@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# biodatatools
|
|
17
|
+
|
|
18
|
+
A python package that provides multiple useful functions for biological data processing.
|
|
19
|
+
Many functions in this package require external programs to be installed.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
biodatatools/__init__.py
|
|
4
|
+
biodatatools.egg-info/PKG-INFO
|
|
5
|
+
biodatatools.egg-info/SOURCES.txt
|
|
6
|
+
biodatatools.egg-info/dependency_links.txt
|
|
7
|
+
biodatatools.egg-info/entry_points.txt
|
|
8
|
+
biodatatools.egg-info/requires.txt
|
|
9
|
+
biodatatools.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
biodatatools
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# The README doubles as the long description shown on the package index.
# Read it explicitly as UTF-8 so the build does not depend on the locale's default encoding.
with open("README.md", "r", encoding="utf-8") as readme_file:
    readme = readme_file.read()

# Runtime dependencies with their minimum supported versions
requirements = ["genomictools>=0.0.7", "biodata>=0.0.6", "pysam>=0.22.1", "pyBigWig>=0.3.22", "simplevc>=0.0.2", "commonhelper>=0.0.1", "mphelper>=0.0.1"]

setup(
    name="biodatatools",
    version="0.0.1",
    author="Alden Leung",
    author_email="alden.leung@gmail.com",
    description="A python package with useful biological data processing methods",
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/aldenleung/biodatatools/",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
    ],
    # Installs a `biodatatools` command that calls biodatatools.main
    entry_points = {
        'console_scripts': [
            'biodatatools = biodatatools:main',
        ],
    }
)
|