biodatatools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: biodatatools
3
+ Version: 0.0.1
4
+ Summary: A python package with useful biological data processing methods
5
+ Home-page: https://github.com/aldenleung/biodatatools/
6
+ Author: Alden Leung
7
+ Author-email: alden.leung@gmail.com
8
+ Classifier: Programming Language :: Python :: 3.7
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
+ Description-Content-Type: text/markdown
15
+
16
+ # biodatatools
17
+
18
+ A python package that provides multiple useful functions for biological data processing.
19
+ Many functions from this package require external programs to be installed.
@@ -0,0 +1,4 @@
1
+ # biodatatools
2
+
3
+ A python package that provides multiple useful functions for biological data processing.
4
+ Many functions from this package require external programs to be installed.
@@ -0,0 +1,234 @@
1
+ import sys
2
+ import simplevc
3
+ simplevc.register(sys.modules[__name__], "0.0.1")
4
+
5
+ import tempfile
6
+ import os
7
+ import subprocess
8
+ import shutil
9
+
10
+ import pysam
11
+ import pyBigWig
12
+
13
+ from biodata.baseio import BaseWriter
14
+ from biodata.bed import BEDPE
15
+ from biodata.delimited import DelimitedReader, DelimitedWriter
16
+
17
+ from commonhelper import convert_to_bool, safe_inverse_zip, nested_default_dict
18
+ from mphelper import ProcessWrapPool
19
+
20
+
21
+
22
+
23
def check_binaries_validity(*binary_names): # Change to decorator in the future
    '''
    Ensure that every named external program can be located with shutil.which.

    :param binary_names: Names (or paths) of the executables that must be available
    :raises Exception: If at least one executable is not found. The message lists only the missing ones
    '''
    missing_binary_names = [binary_name for binary_name in binary_names if shutil.which(binary_name) is None]
    if missing_binary_names:
        # Report only what is actually missing; listing every requested binary
        # would point the user at programs that are installed correctly.
        raise Exception("The following binaries are not found: " + ",".join(missing_binary_names))
27
+
28
def bash_command(cmd):
    '''
    Execute a command line through /bin/bash.

    :param cmd: Command string handed to the shell as-is
    :raises Exception: If the command exits with a non-zero return code
    '''
    completed = subprocess.run(cmd, shell=True, executable='/bin/bash')
    if completed.returncode == 0:
        return
    raise Exception("Bash command fails: " + cmd)
32
+
33
+
34
@vt(
    description="A wrapper to bedGraphToBigWig", helps=dict(
        i="Input bedgraph file", g="chrom size file", o="output bigwig file",
        autosort="Perform sorting on bedgraph file before running bedGraphToBigWig",
        filter_chr="Remove chromosomes in bedgraph file that are not present in chrom.sizes file",
        nthread="Number of threads used in sorting")
)
@vc
def _convert_bedgraph_to_bigwig_20240423(i:str, g:str, o:str, autosort:convert_to_bool=False, filter_chr:convert_to_bool=False, nthread:int=1):
    '''
    Convert bedgraph into bigwig files. Auto sort and filter bedgraphs prior to calling bedGraphToBigWig

    :param i: Input bedgraph file
    :param g: chrom.size file
    :param o: Output bw file
    :param autosort: Perform sorting on bedgraph file before running bedGraphToBigWig
    :param filter_chr: Remove chromosomes in bedgraph file that are not present in chrom.sizes file
    :param nthread: Number of threads used in sorting (passed to sort as --parallel when > 1)
    :raises Exception: If a required binary is missing, or if sorting or bedGraphToBigWig fails
    '''
    import shlex

    def _new_tmp_bedgraph():
        # Only the path is needed; close the handle right away so no descriptor stays open.
        with tempfile.NamedTemporaryFile(mode='w+', suffix=".bg", delete=False) as fh:
            return fh.name

    def _quote(path):
        # Quote paths for the shell so that spaces or metacharacters do not break the command.
        # "~" is expanded first because quoting would otherwise disable the shell's tilde expansion.
        return shlex.quote(os.path.expanduser(path))

    check_binaries_validity("zcat", "sort", "bedGraphToBigWig")
    tmpfiles = []
    try:
        if filter_chr:
            inputfile = _new_tmp_bedgraph()
            tmpfiles.append(inputfile)
            with DelimitedReader(g) as dr:
                chromosomes = {d[0] for d in dr}
            with DelimitedReader(i) as dr, DelimitedWriter(inputfile) as dw:
                for d in dr:
                    if d[0] in chromosomes:
                        dw.write(d)
            i = inputfile

        if autosort:
            inputfile = _new_tmp_bedgraph()
            tmpfiles.append(inputfile)
            if nthread > 1:
                added_param = f"--parallel={nthread} "
            else:
                added_param = ""
            if i.endswith(".gz"):
                result0 = subprocess.run(f"zcat {_quote(i)} | sort -k1,1 -k2,2n {added_param}> {_quote(inputfile)}", shell=True)
            else:
                result0 = subprocess.run(f"sort -k1,1 -k2,2n {added_param}{_quote(i)} > {_quote(inputfile)}", shell=True)
            if result0.returncode != 0:
                raise Exception("Exception in sorting bg")
            i = inputfile

        result = subprocess.run(f"bedGraphToBigWig {_quote(i)} {_quote(g)} {_quote(o)}", shell=True, executable="/bin/bash")
        if result.returncode != 0:
            raise Exception("Exception in running bedGraphToBigWig")
    finally:
        # Remove intermediate files even when one of the steps above fails.
        for tmpfile in tmpfiles:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
84
+
85
@vt(description="Convert GROcap/PROcap/GROseq/PROseq bam file to bigwig files (paired-end reads). Returns 4 bigwig files representing 5' and 3' end of the molecules on plus or minus strand",
    helps=dict(i="Input bam file", g="chrom size file", o="output bigwig file prefix",
               paired_end="True: paired-end sequencing; False: single-end sequencing",
               rna_strand="Indicate whether RNA strand is forward or reverse. In paired-end, forward represents that first read is 5'."
               )
    )
@vc
def _process_PROcap_bam_to_bigwig_20240423(i:str, g:str, o:str, paired_end : convert_to_bool, rna_strand : str):
    '''
    Generate four bigwig files (<o>_5pl.bw, <o>_5mn.bw, <o>_3pl.bw, <o>_3mn.bw) from a bam file.

    first_read_is_5 --> strand == Forward

    :param i: Input bam file
    :param g: chrom.size file
    :param o: Output bigwig file prefix
    :param paired_end: True: paired-end sequencing; False: single-end sequencing
    :param rna_strand: "forward" or "reverse". Only consulted when paired_end is True
    :raises Exception: If a required binary is missing, or rna_strand is invalid for paired-end input
    '''
    import shlex

    check_binaries_validity("samtools", "bedtools", "zcat", "sort", "bedGraphToBigWig")
    # Validate before any temporary file or worker process is created.
    if paired_end and rna_strand not in ("forward", "reverse"):
        raise Exception(f"Unknown rna_strand {rna_strand!r}: expected 'forward' or 'reverse'")

    def _new_tmp(suffix):
        # Only the path is needed; close the handle right away so no descriptor stays open.
        with tempfile.NamedTemporaryFile(mode='w+', suffix=suffix, delete=False) as fh:
            return fh.name

    # Quote the user-supplied path for the shell; expand "~" first since quoting disables tilde expansion.
    bam_in = shlex.quote(os.path.expanduser(i))
    # Shell snippet that negates the 4th bedgraph column (used for minus-strand tracks).
    negate = "awk {'printf (\"%s\\t%s\\t%s\\t-%s\\n\", $1, $2, $3, $4)'}"
    thread = 16

    tmpfiles = [_new_tmp(".bg") for _ in range(4)]
    tmpfiles_bam = []
    try:
        bg5_pl, bg5_mn, bg3_pl, bg3_mn = tmpfiles
        pwpool = ProcessWrapPool(4)
        try:
            if paired_end:
                tmpfiles_bam.extend(_new_tmp(".bam") for _ in range(2))
                bam5, bam3 = tmpfiles_bam
                # SAM flags: 66 = proper pair + first in pair, 130 = proper pair + second in pair.
                flag5, flag3 = (66, 130) if rna_strand == "forward" else (130, 66)
                bam5_pid = pwpool.run(bash_command, args=[f"samtools view -f {flag5} --write-index -@ {thread} -o {bam5} {bam_in}"])
                bam3_pid = pwpool.run(bash_command, args=[f"samtools view -f {flag3} --write-index -@ {thread} -o {bam3} {bam_in}"])
                # Be careful of the strand. We assumed F1R2 setup
                bgpl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam5} -5 -strand + -bg > {bg5_pl}"], dependencies=[bam5_pid])
                bgmn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam5} -5 -strand - -bg | {negate} > {bg5_mn}"], dependencies=[bam5_pid])
                # The 3' tracks use the 5' end of the mate with the strand swapped.
                bg3pl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam3} -5 -strand - -bg > {bg3_pl}"], dependencies=[bam3_pid])
                bg3mn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam3} -5 -strand + -bg | {negate} > {bg3_mn}"], dependencies=[bam3_pid])
            else:
                # No intermediate bam files needed
                bgpl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam_in} -5 -strand + -bg > {bg5_pl}"])
                bgmn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam_in} -5 -strand - -bg | {negate} > {bg5_mn}"])
                bg3pl_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam_in} -3 -strand + -bg > {bg3_pl}"])
                bg3mn_pid = pwpool.run(bash_command, args=[f"bedtools genomecov -ibam {bam_in} -3 -strand - -bg | {negate} > {bg3_mn}"])

            for bg, suffix, pid in ((bg5_pl, "_5pl.bw", bgpl_pid),
                                    (bg5_mn, "_5mn.bw", bgmn_pid),
                                    (bg3_pl, "_3pl.bw", bg3pl_pid),
                                    (bg3_mn, "_3mn.bw", bg3mn_pid)):
                pwpool.run(_convert_bedgraph_to_bigwig_20240423, args=[bg, g, o + suffix], kwargs=dict(autosort=True, filter_chr=True), dependencies=[pid])
            pwpool.get(wait=True)
        finally:
            # NOTE(review): assumes ProcessWrapPool.close() is safe to call after a failure - confirm against mphelper.
            pwpool.close()
    finally:
        # samtools --write-index also writes "<bam>.csi" next to each temporary bam file.
        index_files = [bam + ".csi" for bam in tmpfiles_bam]
        for tmpfile in tmpfiles + tmpfiles_bam + index_files:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
133
+
134
@vt(description="Convert GROcap/PROcap/GROseq/PROseq bam file to bed files Returns 2 bed files with the 4th column as a comma separated list of RNA distances from TSS",
    helps=dict(i="Input bam file", o="output bed file prefix. Two files, _dpl.bed.gz and _dmn.bed.gz are output",
               paired_end="True: paired-end sequencing; False: single-end sequencing",
               rna_strand="Indicate whether RNA strand is forward or reverse. In paired-end, forward represents that first read is 5'.",
               min_rna_len="Minimum RNA length to record",
               max_rna_len="Maximum RNA length to record"
               )
    )
@vc
def _process_PROcap_bam_to_TSS_RNA_len_20240423(i, o, paired_end, rna_strand, min_rna_len=0, max_rna_len=100000, target_chromosomes:str=None):
    '''
    Collect, for every TSS position, the list of RNA lengths derived from read pairs in a bam file.

    Alignments are paired up by query name. A pair is recorded only when both mates are on the
    same chromosome, in opposite orientation facing each other, and the implied RNA length lies
    within [min_rna_len, max_rna_len]. Results are written to <o>_dpl.bed.gz (plus strand) and
    <o>_dmn.bed.gz (minus strand); the 4th column is the sorted, comma separated list of lengths.

    :param i: Input bam file
    :param o: Output bed file prefix
    :param paired_end: Must be true; single-end input is not supported yet
    :param rna_strand: "forward" or "reverse"
    :param min_rna_len: Minimum RNA length to record
    :param max_rna_len: Maximum RNA length to record
    :param target_chromosomes: Currently unused (the filtering code is commented out below)
    :raises Exception: If bgzip is missing, paired_end is false, or rna_strand is invalid
    '''
    # NOTE(review): bgzip is presumably needed by BaseWriter for the .gz outputs - confirm.
    check_binaries_validity("bgzip")
    def _to_position(alignment):
        # 1-based position of the alignment's 5' end, together with its chromosome and strand
        position = alignment.reference_end if alignment.is_reverse else (alignment.reference_start + 1)
        strand = "-" if alignment.is_reverse else "+"
        return (alignment.reference_name, position, strand)

    if not paired_end:
        raise Exception("Single-end not supported yet.")
    # Fail before scanning the bam file rather than at the first qualifying read pair.
    if rna_strand not in ("forward", "reverse"):
        raise Exception(f"Unknown rna_strand {rna_strand!r}: expected 'forward' or 'reverse'")
    saved_reads = {} # query name -> first alignment seen, waiting for its mate
    TSS_counter = nested_default_dict(3, list) # strand -> chromosome -> position -> list of RNA lengths
    with pysam.AlignmentFile(i) as samfh:
        for alignment in samfh:
            # if target_chromosomes is not None and alignment.reference_name not in target_chromosomes:
            #     continue
            if alignment.query_name in saved_reads:
                prev_alignment = saved_reads.pop(alignment.query_name)
                alignment1 = prev_alignment if prev_alignment.is_read1 else alignment
                alignment2 = prev_alignment if prev_alignment.is_read2 else alignment
                p1 = _to_position(alignment1) # read1: Pol
                p2 = _to_position(alignment2) # read2: TSS

                b = BEDPE(p1[0], p1[1] - 1, p1[1], p2[0], p2[1] - 1, p2[1], strand1 = p1[2], strand2 = p2[2])
                if (b.chrom1 == b.chrom2
                    and ( (b.strand1 == "+" and b.strand2 == "-" and b.start1 <= b.start2 and b.stop1 <= b.stop2 and min_rna_len <= b.stop2 - b.start1 <= max_rna_len)
                        or (b.strand1 == "-" and b.strand2 == "+" and b.start2 <= b.start1 and b.stop2 <= b.stop1 and min_rna_len <= b.stop1 - b.start2 <= max_rna_len))):

                    d = b.stop2 - b.start1 if b.strand1 == "+" else b.stop1 - b.start2
                    if rna_strand == "forward":
                        strand = b.strand1
                        if strand == "+":
                            TSS_counter[strand][b.chrom1][b.genomic_pos1.start].append(d)
                        else:
                            TSS_counter[strand][b.chrom1][b.genomic_pos1.stop].append(d)
                    else:
                        # rna_strand == "reverse" (validated above)
                        strand = b.strand2
                        if strand == "+":
                            TSS_counter[strand][b.chrom1][b.genomic_pos2.stop].append(d)
                        else:
                            TSS_counter[strand][b.chrom1][b.genomic_pos2.start].append(d)

            else:
                saved_reads[alignment.query_name] = alignment
    for output_file, strand in zip([f"{o}_dpl.bed.gz", f"{o}_dmn.bed.gz"], ["+", "-"]):
        with BaseWriter(output_file) as bwd:
            regions = TSS_counter[strand]
            for r in sorted(regions.keys()):
                positions = regions[r]
                for p in sorted(positions.keys()):
                    v = sorted(positions[p])
                    bwd.write(f"{r}\t{p - 1}\t{p}\t{','.join(list(map(str, v)))}\n")
199
+
200
+
201
+
202
+
203
+
204
@vt(description="Modify bigwig files according to the func",
    helps=dict(i="Input bigwig file", o="Output bigwig file", func="Function to modify bigwig. Either a python function or a string to be evaluated as python lambda function. For example, to convert all positive values into negative values, 'lambda x: x * -1'")
    )

@vc
def _modify_bigwig_values_20240423(i:str, o:str, func:str):
    '''
    Apply func to the value of every interval in a bigwig file and write the result to a new bigwig file.

    :param i: Input bigwig file
    :param o: Output bigwig file
    :param func: A python callable, or a string that evaluates to one (e.g. 'lambda x: x * -1')
    '''
    if isinstance(func, str):
        # SECURITY: eval executes arbitrary code. Passing empty globals hides this module's
        # names but builtins remain reachable, so only pass expressions from a trusted source.
        func = eval(func, {}) # While unsafe to use eval, disable access to global variables to make it a little bit safer..

    def _get_pyBigWig_all_interval_generator(bw):
        for chrom in bw.chroms():
            # Query each chromosome once and reuse the result
            intervals = bw.intervals(chrom)
            if intervals is not None:
                for interval in intervals:
                    yield (chrom, *interval)

    input_bw = pyBigWig.open(i)
    try:
        output_bw = pyBigWig.open(o, "w")
        try:
            output_bw.addHeader(list(input_bw.chroms().items()))
            all_intervals = list(_get_pyBigWig_all_interval_generator(input_bw))
            chroms, starts, ends, values = safe_inverse_zip(all_intervals, 4)
            values = list(map(func, values))
            output_bw.addEntries(list(chroms), list(starts), ends=list(ends), values=list(values))
        finally:
            # Close the handles even if reading, transforming or writing fails
            output_bw.close()
    finally:
        input_bw.close()
229
+
230
+
231
# Script entry point. `main` is not defined in this file's visible source; the
# console_scripts entry point also refers to `biodatatools:main`.
# NOTE(review): presumably `main` is injected by simplevc.register() at the top of the module - confirm.
if __name__ == "__main__":
    main()
233
+
234
+
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: biodatatools
3
+ Version: 0.0.1
4
+ Summary: A python package with useful biological data processing methods
5
+ Home-page: https://github.com/aldenleung/biodatatools/
6
+ Author: Alden Leung
7
+ Author-email: alden.leung@gmail.com
8
+ Classifier: Programming Language :: Python :: 3.7
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
+ Description-Content-Type: text/markdown
15
+
16
+ # biodatatools
17
+
18
+ A python package that provides multiple useful functions for biological data processing.
19
+ Many functions from this package require external programs to be installed.
@@ -0,0 +1,9 @@
1
+ README.md
2
+ setup.py
3
+ biodatatools/__init__.py
4
+ biodatatools.egg-info/PKG-INFO
5
+ biodatatools.egg-info/SOURCES.txt
6
+ biodatatools.egg-info/dependency_links.txt
7
+ biodatatools.egg-info/entry_points.txt
8
+ biodatatools.egg-info/requires.txt
9
+ biodatatools.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ biodatatools = biodatatools:main
@@ -0,0 +1,7 @@
1
+ genomictools>=0.0.7
2
+ biodata>=0.0.6
3
+ pysam>=0.22.1
4
+ pyBigWig>=0.3.22
5
+ simplevc>=0.0.2
6
+ commonhelper>=0.0.1
7
+ mphelper>=0.0.1
@@ -0,0 +1 @@
1
+ biodatatools
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,32 @@
1
from setuptools import setup, find_packages

# Read the README with an explicit encoding so the build does not depend on the locale.
with open("README.md", "r", encoding="utf-8") as readme_file:
    readme = readme_file.read()

# Runtime dependencies of the package
requirements = ["genomictools>=0.0.7", "biodata>=0.0.6", "pysam>=0.22.1", "pyBigWig>=0.3.22", "simplevc>=0.0.2", "commonhelper>=0.0.1", "mphelper>=0.0.1"]

setup(
    name="biodatatools",
    version="0.0.1",
    author="Alden Leung",
    author_email="alden.leung@gmail.com",
    description="A python package with useful biological data processing methods",
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/aldenleung/biodatatools/",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
    ],
    # Installs the `biodatatools` command, which calls `main` from the biodatatools package
    entry_points = {
        'console_scripts': [
            'biodatatools = biodatatools:main',
        ],
    }
)