biastools 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biastools/__init__.py +3 -0
- biastools/biastools.py +307 -0
- biastools/biastools_align.sh +38 -0
- biastools/biastools_analysis.sh +73 -0
- biastools/biastools_compare.sh +14 -0
- biastools/biastools_predict.sh +23 -0
- biastools/biastools_scan.py +165 -0
- biastools/biastools_simulation.sh +57 -0
- biastools/compare_bias_with_RD.py +125 -0
- biastools/consensus_vcf_map_adaptive.py +693 -0
- biastools/filter_het_VCF.py +33 -0
- biastools/golden_graph.py +301 -0
- biastools/golden_graph_report.py +196 -0
- biastools/indel_balance_plot.py +348 -0
- biastools/merge_baseline.py +159 -0
- biastools/predict_experiment.py +186 -0
- biastools/predict_model.py +44 -0
- biastools/ref_bi_context.1021.py +809 -0
- biastools/ref_bi_context.py +887 -0
- biastools/ref_bi_naive.py +527 -0
- biastools/sample_baseline.py +151 -0
- biastools/scanning_bias.py +470 -0
- biastools/vcf_to_bed.py +23 -0
- biastools-0.3.2.data/data/biastools/biastools_align.sh +38 -0
- biastools-0.3.2.data/data/biastools/biastools_analysis.sh +73 -0
- biastools-0.3.2.data/data/biastools/biastools_compare.sh +14 -0
- biastools-0.3.2.data/data/biastools/biastools_predict.sh +23 -0
- biastools-0.3.2.data/data/biastools/biastools_simulation.sh +57 -0
- biastools-0.3.2.dist-info/METADATA +36 -0
- biastools-0.3.2.dist-info/RECORD +34 -0
- biastools-0.3.2.dist-info/WHEEL +5 -0
- biastools-0.3.2.dist-info/entry_points.txt +3 -0
- biastools-0.3.2.dist-info/licenses/LICENSE +21 -0
- biastools-0.3.2.dist-info/top_level.txt +1 -0
biastools/__init__.py
ADDED
biastools/biastools.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# Wrapper script for the first and second biastools modules (simulate / align / analyze / predict)
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
import argparse
|
|
6
|
+
from shutil import which
|
|
7
|
+
|
|
8
|
+
def is_tool(name):
    """Return True when `name` can be resolved to an executable via PATH."""
    resolved = which(name)
    if resolved is None:
        return False
    return True
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def check_program_install(list_names):
    """
    Verify that every prerequisite program in `list_names` is installed.

    One message is printed for each program that `is_tool` cannot find.
    If at least one program is missing, a hint about the --force option is
    printed and the process terminates with exit status 1.  When everything
    is available the function simply returns None.
    """
    list_missing = [name for name in list_names if not is_tool(name)]
    for name in list_missing:
        print(name, "is a prerequisite program, please install it before running biastools")
    if list_missing:
        print("Use --force option if you want to disable the prerequisite program check.")
        # sys.exit instead of the builtin exit(): the latter is injected by
        # the `site` module and is missing under `python -S` / frozen builds.
        sys.exit(1)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def bool2str(flag):
    """Encode a truthy/falsy value as the string "1" or "0"."""
    return "1" if flag else "0"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def catch_assert(parser, message):
    """
    Report a command-line usage error and terminate.

    Prints `message` surrounded by blank lines, then the usage line of
    `parser` (an argparse.ArgumentParser), and exits with status 1.
    This function never returns.
    """
    print('\n', message, '\n')
    parser.print_usage()
    # sys.exit instead of the builtin exit(): the latter is injected by the
    # `site` module and is missing under `python -S` / frozen builds.
    sys.exit(1)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def ensure_processed_vcf(path_ref, path_vcf, path_output, sample_id, path_module, parser):
    """
    Ensure that the processed heterozygous VCF (<out>/<sample_id>.het.vcf.gz) exists.

    If the file is already present its path is returned immediately.
    Otherwise it is regenerated with the same steps as the analysis shell
    script: `bcftools norm` + `bcftools index` on the input VCF, then
    `filter_het_VCF.py` and `tabix` on the result.

    path_ref    -- reference genome; only needed when regenerating
    path_vcf    -- input VCF; only needed when regenerating
    path_output -- output directory holding the processed files
    sample_id   -- sample ID used as the file name prefix
    path_module -- directory of the biastools scripts, with trailing '/'
    parser      -- argparse parser, used to print usage on invalid input

    Returns the path of the heterozygous VCF.  Exits through catch_assert
    when regeneration is needed but <genome> or <vcf> is missing; raises
    subprocess.CalledProcessError when one of the external tools fails.
    """
    prefix = path_output + '/' + sample_id
    het_vcf_path = prefix + '.het.vcf.gz'
    if os.path.exists(het_vcf_path):
        return het_vcf_path

    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, which would let None reach the bcftools command line.
    if path_ref is None or path_vcf is None:
        catch_assert(
            parser,
            "<genome> and <vcf> should be specified when using --analyze "
            "if the processed VCF does not already exist in the output directory.",
        )

    normalized_vcf = prefix + '.normalized.vcf.gz'
    print("[Biastools] Prepare processed VCF for analysis...")
    # Normalize the input VCF against the reference
    subprocess.check_call([
        'bcftools', 'norm',
        '-f', path_ref,
        path_vcf,
        '-m', '+any',
        '-Oz',
        '-o', normalized_vcf,
    ])
    subprocess.check_call(['bcftools', 'index', normalized_vcf])

    # Filter heterozygous sites and index the resulting VCF
    subprocess.check_call([
        'python3',
        path_module + 'filter_het_VCF.py',
        '-v', normalized_vcf,
        '-o', het_vcf_path,
    ])
    subprocess.check_call(['tabix', '-p', 'vcf', het_vcf_path])

    return het_vcf_path
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def main():
|
|
82
|
+
parser = argparse.ArgumentParser(description="Simulation/Alignment/Analyzing/Prediction module of the Biastools v0.3.2")
|
|
83
|
+
parser.add_argument('--version', action='version', version='%(prog)s 0.3.2')
|
|
84
|
+
parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir")
|
|
85
|
+
parser.add_argument('-g', '--genome', help="Path to the reference genome.")
|
|
86
|
+
parser.add_argument('-v', '--vcf', nargs='+',
|
|
87
|
+
help="Path(s) to VCF file(s). For simulation/align/single-run analyze, "
|
|
88
|
+
"provide exactly one. For multi-report analyze, you may provide "
|
|
89
|
+
"one VCF shared by all reports or one per report.")
|
|
90
|
+
parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample")
|
|
91
|
+
parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run")
|
|
92
|
+
# Process options
|
|
93
|
+
parser.add_argument('--simulate', help='[1] Option to run biastools simulation.', action='store_true')
|
|
94
|
+
parser.add_argument('--align', help='[2] Option to run biastools align.', action='store_true')
|
|
95
|
+
parser.add_argument('--analyze', help='[3] Option to run biastools analyze.', action='store_true')
|
|
96
|
+
parser.add_argument('--predict', help='[4] Option to predict bias from analysis report.', action='store_true')
|
|
97
|
+
|
|
98
|
+
parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int)
|
|
99
|
+
parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true')
|
|
100
|
+
# [1]
|
|
101
|
+
parser.add_argument('-x', '--coverage', help="Read coverage to simulate [30].", type=int, default=30)
|
|
102
|
+
# [2]
|
|
103
|
+
parser.add_argument('-a', '--aligner', help="Aligner to use (bowtie2|bwamem) [bowtie2]", default="bowtie2")
|
|
104
|
+
parser.add_argument('-b', '--align_index', help="Path to the aligner index (target reference)")
|
|
105
|
+
# [3]
|
|
106
|
+
parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be sorted [out_dir/sample.run_id.sorted.bam].")
|
|
107
|
+
parser.add_argument('-n', '--naive', help= "Option to run the naive assignment method [False].", action='store_true')
|
|
108
|
+
parser.add_argument('-R', '--real', help= "Option for performing analysis on real data [False].", action='store_true')
|
|
109
|
+
parser.add_argument('-d', '--boundary', help= "Boundary to plot the indel balance plot [20]", type=int, default=20)
|
|
110
|
+
parser.add_argument('-lr', '--list_report', help= "List of bias report to plot the indel balance plot", nargs='+')
|
|
111
|
+
parser.add_argument('-ld', '--list_run_id', help= "List of run ID for namings in the indel balance plot", nargs='+')
|
|
112
|
+
# [4]
|
|
113
|
+
parser.add_argument('-ps', '--sim_report', help= "Path to the simulation report.")
|
|
114
|
+
parser.add_argument('-pr', '--real_report', help= "Path to the real read report [out_dir/sample.real.run.bias].")
|
|
115
|
+
args = parser.parse_args()
|
|
116
|
+
|
|
117
|
+
##### Parameters for biastool_analysis
|
|
118
|
+
path_output = args.out
|
|
119
|
+
path_ref = args.genome
|
|
120
|
+
vcf_list = args.vcf # may be None or a list of one/many paths
|
|
121
|
+
# For steps that only support a single VCF (simulate/align/single-run analyze),
|
|
122
|
+
# use the first entry when provided.
|
|
123
|
+
path_vcf = vcf_list[0] if vcf_list else None
|
|
124
|
+
sample_id = args.sample_id
|
|
125
|
+
run_id = args.run_id
|
|
126
|
+
bam_file = args.bam
|
|
127
|
+
if bam_file == None:
|
|
128
|
+
bam_file = path_output + '/' + sample_id + '.' + run_id + '.sorted.bam'
|
|
129
|
+
|
|
130
|
+
flag_simulate = args.simulate
|
|
131
|
+
flag_align = args.align
|
|
132
|
+
flag_analyze = args.analyze
|
|
133
|
+
flag_predict = args.predict
|
|
134
|
+
|
|
135
|
+
path_module = os.path.dirname(__file__) + '/'
|
|
136
|
+
try:
|
|
137
|
+
assert flag_simulate + flag_align + flag_analyze + flag_predict >= 1
|
|
138
|
+
except AssertionError:
|
|
139
|
+
catch_assert(parser, "At least one of the --simulate/align/analyze/predict option should be specified.")
|
|
140
|
+
|
|
141
|
+
flag_force = args.force
|
|
142
|
+
thread = args.thread
|
|
143
|
+
if thread == None:
|
|
144
|
+
if sys.platform == "darwin":
|
|
145
|
+
result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
|
|
146
|
+
else:
|
|
147
|
+
result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
|
|
148
|
+
thread = int(result.stdout.strip())
|
|
149
|
+
|
|
150
|
+
coverage = args.coverage
|
|
151
|
+
aligner = args.aligner
|
|
152
|
+
align_index = args.align_index
|
|
153
|
+
try:
|
|
154
|
+
assert aligner=="bowtie2" or aligner=="bwamem"
|
|
155
|
+
except AssertionError:
|
|
156
|
+
catch_assert(parser, "Only bowtie2 and bwamem are supported.")
|
|
157
|
+
|
|
158
|
+
flag_naive = args.naive
|
|
159
|
+
flag_real = args.real
|
|
160
|
+
boundary = args.boundary
|
|
161
|
+
list_report = args.list_report
|
|
162
|
+
list_run_id = args.list_run_id
|
|
163
|
+
if list_report:
|
|
164
|
+
try:
|
|
165
|
+
assert len(list_report) == len(list_run_id)
|
|
166
|
+
except AssertionError:
|
|
167
|
+
catch_assert(parser, "Number of list --list_report and --list_run_id entries are inconsistent.")
|
|
168
|
+
|
|
169
|
+
# Multiple VCF paths are only meaningful for multi-report analyze mode,
|
|
170
|
+
# where each bias report can have its own VCF. For all other modes, require
|
|
171
|
+
# at most one VCF path.
|
|
172
|
+
if vcf_list and len(vcf_list) > 1:
|
|
173
|
+
if not (flag_analyze and list_report):
|
|
174
|
+
catch_assert(
|
|
175
|
+
parser,
|
|
176
|
+
"Multiple --vcf paths are only supported when using --analyze "
|
|
177
|
+
"together with --list_report/--list_run_id.",
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
sim_report = args.sim_report
|
|
181
|
+
real_report = args.real_report
|
|
182
|
+
if flag_predict:
|
|
183
|
+
try:
|
|
184
|
+
assert real_report != None
|
|
185
|
+
except AssertionError:
|
|
186
|
+
catch_assert(parser, "<real_report> should be specified when using --predict")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Checking prerequisite programs are installed
|
|
191
|
+
if flag_force != True:
|
|
192
|
+
list_program = ["bedtools", \
|
|
193
|
+
"samtools", \
|
|
194
|
+
"bcftools", \
|
|
195
|
+
"gzip", \
|
|
196
|
+
"tabix"]
|
|
197
|
+
if flag_align:
|
|
198
|
+
list_program += ["bwa", "bowtie2"]
|
|
199
|
+
if flag_simulate:
|
|
200
|
+
list_program.append("mason_simulator")
|
|
201
|
+
check_program_install( list_program )
|
|
202
|
+
|
|
203
|
+
# Start running
|
|
204
|
+
command = "mkdir -p " + path_output
|
|
205
|
+
subprocess.call(command, shell=True)
|
|
206
|
+
|
|
207
|
+
if flag_simulate:
|
|
208
|
+
try:
|
|
209
|
+
assert path_ref != None
|
|
210
|
+
assert path_vcf != None
|
|
211
|
+
except AssertionError:
|
|
212
|
+
catch_assert(parser, "<genome> and <vcf> should be specified when using --simulate")
|
|
213
|
+
print("[Biastools] Simulate...")
|
|
214
|
+
command = ' '.join(["bash", path_module+"biastools_simulation.sh", path_ref, path_vcf, path_output, sample_id, str(thread), str(coverage), path_module])
|
|
215
|
+
#print(command)
|
|
216
|
+
subprocess.call(command, shell=True)
|
|
217
|
+
if flag_align:
|
|
218
|
+
try:
|
|
219
|
+
assert path_ref != None
|
|
220
|
+
assert path_vcf != None
|
|
221
|
+
except AssertionError:
|
|
222
|
+
catch_assert(parser, "<genome> and <vcf> should be specified when using --align")
|
|
223
|
+
if align_index == None:
|
|
224
|
+
align_index = path_ref
|
|
225
|
+
print("[Biastools] Align...")
|
|
226
|
+
command = ' '.join(["bash", path_module+"biastools_align.sh", path_ref, path_vcf, path_output, sample_id, str(thread), aligner, align_index, run_id, path_module])
|
|
227
|
+
#print(command)
|
|
228
|
+
subprocess.call(command, shell=True)
|
|
229
|
+
if flag_analyze:
|
|
230
|
+
if list_report != None:
|
|
231
|
+
# Multi-report indel balance plotting.
|
|
232
|
+
# If multiple VCFs are provided, pass them directly through to
|
|
233
|
+
# indel_balance_plot.py, one per report. Otherwise, ensure the
|
|
234
|
+
# processed VCF for this sample exists and use it for all reports.
|
|
235
|
+
if vcf_list and len(vcf_list) > 1:
|
|
236
|
+
if len(vcf_list) != len(list_report):
|
|
237
|
+
catch_assert(
|
|
238
|
+
parser,
|
|
239
|
+
"When providing multiple --vcf paths, their number must "
|
|
240
|
+
"match the number of --list_report entries.",
|
|
241
|
+
)
|
|
242
|
+
vcf_args = ["-vcf"] + vcf_list
|
|
243
|
+
else:
|
|
244
|
+
# Ensure the processed VCF exists (or regenerate it) before plotting.
|
|
245
|
+
het_vcf_path = ensure_processed_vcf(
|
|
246
|
+
path_ref=path_ref,
|
|
247
|
+
path_vcf=path_vcf,
|
|
248
|
+
path_output=path_output,
|
|
249
|
+
sample_id=sample_id,
|
|
250
|
+
path_module=path_module,
|
|
251
|
+
parser=parser,
|
|
252
|
+
)
|
|
253
|
+
vcf_args = ["-vcf", het_vcf_path]
|
|
254
|
+
|
|
255
|
+
print("[Biastools] Plot the indel balance plot for multiple bias reports...")
|
|
256
|
+
if flag_real:
|
|
257
|
+
subprocess.call(
|
|
258
|
+
['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report
|
|
259
|
+
+ ["-ln"] + list_run_id
|
|
260
|
+
+ vcf_args
|
|
261
|
+
+ ["-bd", str(boundary), "-map",
|
|
262
|
+
"-out", path_output+"/"+sample_id+"."+run_id+".real", "-real"]
|
|
263
|
+
)
|
|
264
|
+
else:
|
|
265
|
+
subprocess.call(
|
|
266
|
+
['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report
|
|
267
|
+
+ ["-ln"] + list_run_id
|
|
268
|
+
+ vcf_args
|
|
269
|
+
+ ["-bd", str(boundary), "-map",
|
|
270
|
+
"-out", path_output+"/"+sample_id+"."+run_id+".sim"]
|
|
271
|
+
)
|
|
272
|
+
else:
|
|
273
|
+
try:
|
|
274
|
+
assert path_ref != None
|
|
275
|
+
assert path_vcf != None
|
|
276
|
+
except AssertionError:
|
|
277
|
+
catch_assert(parser, "<genome> and <vcf> should be specified when using --analyze")
|
|
278
|
+
# When running analyze directly, ensure that the BAM file is either
|
|
279
|
+
# explicitly provided or present at the default location.
|
|
280
|
+
if not os.path.exists(bam_file):
|
|
281
|
+
catch_assert(
|
|
282
|
+
parser,
|
|
283
|
+
"BAM file not found. Please specify --bam <file> explicitly "
|
|
284
|
+
"or ensure it exists at the default path: "
|
|
285
|
+
f"{path_output}/{sample_id}.{run_id}.sorted.bam",
|
|
286
|
+
)
|
|
287
|
+
print("[Biastools] Analyze and plot...")
|
|
288
|
+
command = ' '.join(["bash", path_module+"biastools_analysis.sh", path_ref, path_vcf, path_output, sample_id, str(thread), run_id, bool2str(flag_real), \
|
|
289
|
+
bool2str(flag_naive), str(boundary), path_module, bam_file])
|
|
290
|
+
#print(command)
|
|
291
|
+
subprocess.call(command, shell=True)
|
|
292
|
+
if flag_predict:
|
|
293
|
+
print("[Biastools] Predict bias...")
|
|
294
|
+
command = ' '.join(["bash", path_module+"biastools_predict.sh", path_output, sample_id, run_id, bool2str(flag_real), real_report, sim_report, path_module])
|
|
295
|
+
#print(command)
|
|
296
|
+
subprocess.call(command, shell=True)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
if __name__ == "__main__":
|
|
303
|
+
main()
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Align the per-haplotype reads (<prefix>.hapA/hapB_{1,2}.fq.gz) with bowtie2
# or BWA MEM, sort each alignment, and merge both into
# <prefix>.<run_id>.sorted.bam.
#
# Positional arguments:
#   $1 reference genome      $2 VCF (not used in this script)
#   $3 output directory      $4 sample ID
#   $5 number of threads     $6 aligner (bowtie2|bwamem)
#   $7 aligner index prefix, or 'none' to use the reference path
#   $8 run ID                $9 module directory (not used in this script)
#
# All expansions are quoted so that paths containing spaces stay intact.
path_ref="$1"
path_vcf="$2"
path_out="$3"
sample_id="$4"
THR="$5"
ALN="$6"
ALN_IDX="$7"
run_id="$8"
path_module="$9"
prefix="${path_out}/${sample_id}"

echo "[Biastools] Align sequences to the original reference"
if [[ "${ALN_IDX}" == 'none' ]]; then
    ALN_IDX="${path_ref}"
fi

if [[ "${ALN}" == "bowtie2" ]]; then
    echo "[Biastools] Align with bowtie2"
    # bowtie2-build writes *.bt2l instead of *.bt2 for large genomes; accept
    # either so that an existing large index is not rebuilt.
    if [ ! -f "${ALN_IDX}.1.bt2" ] && [ ! -f "${ALN_IDX}.1.bt2l" ]; then
        bowtie2-build "${path_ref}" "${ALN_IDX}"
    fi
    bowtie2 -p "${THR}" -x "${ALN_IDX}" --rg-id "${run_id}_hapA" --rg "SM:${sample_id}" -1 "${prefix}.hapA_1.fq.gz" -2 "${prefix}.hapA_2.fq.gz" |\
        samtools sort -o "${prefix}.hapA.${run_id}.sorted.bam"
    bowtie2 -p "${THR}" -x "${ALN_IDX}" --rg-id "${run_id}_hapB" --rg "SM:${sample_id}" -1 "${prefix}.hapB_1.fq.gz" -2 "${prefix}.hapB_2.fq.gz" |\
        samtools sort -o "${prefix}.hapB.${run_id}.sorted.bam"
elif [[ "${ALN}" == "bwamem" ]]; then
    echo "[Biastools] Align with BWA MEM"
    if [ ! -f "${ALN_IDX}.bwt" ]; then
        # Options are placed before the positional argument so that parsing
        # does not depend on GNU-style argument permutation.
        bwa index -p "${ALN_IDX}" "${path_ref}"
    fi
    bwa mem -t "${THR}" -R "@RG\tID:${run_id}_hapA\tSM:${sample_id}" "${ALN_IDX}" "${prefix}.hapA_1.fq.gz" "${prefix}.hapA_2.fq.gz" |\
        samtools sort -@ "${THR}" -o "${prefix}.hapA.${run_id}.sorted.bam" -
    bwa mem -t "${THR}" -R "@RG\tID:${run_id}_hapB\tSM:${sample_id}" "${ALN_IDX}" "${prefix}.hapB_1.fq.gz" "${prefix}.hapB_2.fq.gz" |\
        samtools sort -@ "${THR}" -o "${prefix}.hapB.${run_id}.sorted.bam" -
else
    # Without this branch an unknown aligner fell through to the merge below,
    # which then failed on the missing per-haplotype BAM files.
    echo "[Biastools] Unsupported aligner '${ALN}': only bowtie2 and bwamem are supported." >&2
    exit 1
fi
samtools merge -f "${prefix}.${run_id}.sorted.bam" "${prefix}.hapA.${run_id}.sorted.bam" "${prefix}.hapB.${run_id}.sorted.bam"
|
|
37
|
+
|
|
38
|
+
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Reference-bias analysis for one alignment run.
#
# Steps:
#   1. Build <prefix>.het.vcf.gz (normalize + heterozygous filter) if missing.
#   2. Restrict the BAM to the heterozygous sites if
#      <prefix>.<run_id>.sorted.het.bam is missing.
#   3. Run the assignment script (naive or context-aware) and the plotting /
#      reporting scripts into <out>/<run_id>_report/.
#
# Positional arguments:
#   $1 reference genome   $2 VCF              $3 output directory
#   $4 sample ID          $5 threads          $6 run ID
#   $7 flag_real (1/0)    $8 flag_naive (1/0) $9 indel plot boundary
#   ${10} module directory (with trailing '/') ${11} sorted BAM file
#
# All expansions are quoted so that paths containing spaces stay intact.
path_ref="$1"
path_vcf="$2"
path_out="$3"
sample_id="$4"
THR="$5"
run_id="$6"
flag_real="$7"
flag_naive="$8"
boundary="$9"
path_module="${10}"
prefix="${path_out}/${sample_id}"
bam_file="${11}"


echo "[Biastools] Intersect the bam file and vcf file"
if [ ! -f "${prefix}.het.vcf.gz" ]; then
    bcftools norm -f "${path_ref}" "${path_vcf}" -m +any -Oz -o "${prefix}.normalized.vcf.gz"
    bcftools index "${prefix}.normalized.vcf.gz"
    python3 "${path_module}filter_het_VCF.py" -v "${prefix}.normalized.vcf.gz" -o "${prefix}.het.vcf.gz"
    tabix -p vcf "${prefix}.het.vcf.gz"
fi
if [ ! -f "${prefix}.${run_id}.sorted.het.bam" ]; then
    python3 "${path_module}vcf_to_bed.py" -v "${prefix}.het.vcf.gz" -o "${prefix}.het.bed"
    # Options precede the input file so that parsing does not depend on
    # GNU-style argument permutation.
    samtools view -h -@ "${THR}" -L "${prefix}.het.bed" "${bam_file}" | samtools sort -@ "${THR}" > "${prefix}.${run_id}.sorted.het.bam"
    samtools index "${prefix}.${run_id}.sorted.het.bam"
fi


echo "[Biastools] Reference bias analysis"
if [[ "${flag_naive}" == 1 ]]; then
    assign_method="${path_module}ref_bi_naive.py"
else
    assign_method="${path_module}ref_bi_context.py"
fi

mkdir -p "${path_out}/${run_id}_report"
r_prefix="${path_out}/${run_id}_report/${sample_id}"
if [[ "${flag_real}" == 1 ]]; then
    python3 "${assign_method}" -s "${prefix}.${run_id}.sorted.het.bam" \
                               -v "${prefix}.het.vcf.gz" \
                               -f "${path_ref}" \
                               -p "${prefix}.golden.rpt.pickle" \
                               -o "${r_prefix}.${run_id}.real.bias" \
                               --real
    # indel balance plot
    python3 "${path_module}indel_balance_plot.py" -lr "${r_prefix}.${run_id}.real.bias.all" \
                                                  -ln "${run_id}" \
                                                  -vcf "${prefix}.het.vcf.gz" \
                                                  -bd "${boundary}" \
                                                  -map \
                                                  -out "${r_prefix}.${run_id}.real" \
                                                  -real
else
    python3 "${assign_method}" -s "${prefix}.${run_id}.sorted.het.bam" \
                               -v "${prefix}.het.vcf.gz" \
                               -f "${path_ref}" \
                               -p "${prefix}.golden.rpt.pickle" \
                               -o "${r_prefix}.${run_id}.sim.bias"

    # report the bias categories and report
    python3 "${path_module}golden_graph_report.py" -mb "${r_prefix}.${run_id}.sim.bias.snp" -out "${r_prefix}.${run_id}.snp"
    python3 "${path_module}golden_graph_report.py" -mb "${r_prefix}.${run_id}.sim.bias.gap" -out "${r_prefix}.${run_id}.gap"
    # plot the measures with NMB and NAB
    python3 "${path_module}golden_graph.py" -mb "${r_prefix}.${run_id}.sim.bias.snp" -out "${r_prefix}.${run_id}.snp"
    python3 "${path_module}golden_graph.py" -mb "${r_prefix}.${run_id}.sim.bias.gap" -out "${r_prefix}.${run_id}.gap"
    # indel balance plot
    python3 "${path_module}indel_balance_plot.py" -lr "${r_prefix}.${run_id}.sim.bias.all" \
                                                  -ln "${run_id}" \
                                                  -vcf "${prefix}.het.vcf.gz" \
                                                  -bd "${boundary}" \
                                                  -map \
                                                  -out "${r_prefix}.${run_id}.sim"
fi
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Compare two bias reports (BED files), taking the low read-depth regions of
# the second report into account.
#
# Positional arguments:
#   $1 output directory   $2 sample ID   $3 run ID
#   $4 target (first) bias BED           $5 second ("improve") bias BED
#   $6 low read-depth BED of the second report
#   $7 module directory (with trailing '/')
#
# All expansions are quoted so that paths containing spaces stay intact.
path_out="$1"
sample_id="$2"
run_id="$3"
target_bed="$4"
improve_bed="$5"
improve_lowRd="$6"
path_module="$7"
prefix="${path_out}/${sample_id}"

# Second-report bias regions outside its low read-depth regions
bedtools subtract -a "${improve_bed}" -b "${improve_lowRd}" > "${prefix}.improve.goodRd.bias.bed"
# First-report bias regions overlapping the second report's low read-depth regions
bedtools intersect -a "${target_bed}" -b "${improve_lowRd}" > "${prefix}.improve.skipped.bias.bed"

python3 "${path_module}compare_bias_with_RD.py" -lt "${target_bed}" -li "${prefix}.improve.goodRd.bias.bed" -lrd "${prefix}.improve.skipped.bias.bed" -out "${prefix}.${run_id}.improve.bias.bed"
#python3 check_inside_centromere.py -lr1 centromere_extend.bed -lr2 ${prefix}.${run_id}.improve.bias.bed
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Predict bias from an analysis report.
#
# Runs predict_model.py on the real report when flag_real is 1 or no
# simulation report is given; otherwise runs predict_experiment.py with both
# the simulation and the real report.
#
# Positional arguments:
#   $1 output directory   $2 sample ID   $3 run ID   $4 flag_real (1/0)
#   $5 real report, or 'none' for <prefix>.real.<run_id>.bias
#   $6 simulation report, or 'none' when unavailable
#   $7 module directory (with trailing '/')
#
# All expansions are quoted so that paths containing spaces stay intact.
path_out="$1"
sample_id="$2"
run_id="$3"
flag_real="$4"
report_real="$5"
report_simulation="$6"
path_module="$7"
prefix="${path_out}/${sample_id}"


if [[ "${report_real}" == 'none' ]]; then
    report_real="${prefix}.real.${run_id}.bias"
fi

if [[ "${flag_real}" == 1 || "${report_simulation}" == 'none' ]]; then
    echo "[Biastools] Real report bias prediction."
    python3 "${path_module}predict_model.py" -rr "${report_real}" -out "${prefix}.real.${run_id}"
else
    echo "[Biastools] Bias prediction based on simulation report!"
    python3 "${path_module}predict_experiment.py" -sr "${report_simulation}" \
                                                  -rr "${report_real}" \
                                                  -out "${prefix}.sim.${run_id}"
fi
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Wrapper script for the third biastools module (scan / compare)
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
import argparse
|
|
6
|
+
from biastools.biastools import check_program_install, catch_assert
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main():
|
|
10
|
+
parser = argparse.ArgumentParser()
|
|
11
|
+
parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir")
|
|
12
|
+
parser.add_argument('-g', '--genome', help="Path to the reference genome.")
|
|
13
|
+
parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be SORTED.")
|
|
14
|
+
parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample")
|
|
15
|
+
parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run")
|
|
16
|
+
# Process options
|
|
17
|
+
parser.add_argument('--scan', help='[1] Option to scan and report bias region.', action='store_true')
|
|
18
|
+
parser.add_argument('--compare_bam', help='[2] Option to generate common baseline and compare.', action='store_true')
|
|
19
|
+
parser.add_argument('--compare_rpt', help='[3] Option to directly compare two bias report.', action='store_true')
|
|
20
|
+
|
|
21
|
+
parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int)
|
|
22
|
+
parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true')
|
|
23
|
+
# [1]
|
|
24
|
+
parser.add_argument('-w', '--wig', help="Generate the wig files for the three measures, VERY SLOW [False]", action='store_true')
|
|
25
|
+
parser.add_argument('-R', '--range', help="The range in the bam file targeted for analysis.")
|
|
26
|
+
# [2]
|
|
27
|
+
parser.add_argument('-i2', '--bam2', help="Path to the second alignment bam file want to compare, should be SORTED.")
|
|
28
|
+
parser.add_argument('-m', '--mpileup', help="Path to the mpileup file of the first bam file.")
|
|
29
|
+
parser.add_argument('-m2', '--mpileup2', help="Path to the mpileup file of the second bam file.")
|
|
30
|
+
# [3]
|
|
31
|
+
parser.add_argument('-b1', '--bed1', help="Path to the first bed file for comparison.")
|
|
32
|
+
parser.add_argument('-b2', '--bed2', help="Path to the second bed file for comparison.")
|
|
33
|
+
parser.add_argument('-l2', '--lowRd2', help="Path to the .lowRd.bed report of the second file.")
|
|
34
|
+
args = parser.parse_args()
|
|
35
|
+
|
|
36
|
+
path_output = args.out
|
|
37
|
+
path_ref = args.genome
|
|
38
|
+
bam_file = args.bam
|
|
39
|
+
sample_id = args.sample_id
|
|
40
|
+
run_id = args.run_id
|
|
41
|
+
|
|
42
|
+
flag_scan = args.scan
|
|
43
|
+
flag_compare_bam = args.compare_bam
|
|
44
|
+
flag_compare_rpt = args.compare_rpt
|
|
45
|
+
try:
|
|
46
|
+
assert flag_scan + flag_compare_bam + flag_compare_rpt >= 1
|
|
47
|
+
except AssertionError:
|
|
48
|
+
catch_assert(parser, "At least one of the --scan/compare_bam/compare_rpt option should be specified.")
|
|
49
|
+
|
|
50
|
+
flag_force = args.force
|
|
51
|
+
thread = args.thread
|
|
52
|
+
if thread == None:
|
|
53
|
+
if sys.platform == "darwin":
|
|
54
|
+
result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
|
|
55
|
+
else:
|
|
56
|
+
result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
|
|
57
|
+
thread = int(result.stdout.strip())
|
|
58
|
+
flag_wig = args.wig
|
|
59
|
+
Range = args.range
|
|
60
|
+
bam_file2 = args.bam2
|
|
61
|
+
mpileup_file = args.mpileup
|
|
62
|
+
mpileup_file2 = args.mpileup2
|
|
63
|
+
bed_file1 = args.bed1
|
|
64
|
+
bed_file2 = args.bed2
|
|
65
|
+
lowRd_file2 = args.lowRd2
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Checking prerequisite programs are installed
|
|
69
|
+
if flag_force != True:
|
|
70
|
+
check_program_install(["bedtools", \
|
|
71
|
+
"samtools", \
|
|
72
|
+
"bcftools"])
|
|
73
|
+
|
|
74
|
+
# Start running
|
|
75
|
+
command = "mkdir -p " + path_output
|
|
76
|
+
subprocess.call(command, shell=True)
|
|
77
|
+
prefix = path_output + '/' + sample_id
|
|
78
|
+
path_module = os.path.dirname(__file__) + '/'
|
|
79
|
+
if flag_scan:
|
|
80
|
+
print("[Biastools] Scanning...")
|
|
81
|
+
if os.path.exists(bam_file+'.bai'):
|
|
82
|
+
pass
|
|
83
|
+
else:
|
|
84
|
+
command = ["samtools", "index", bam_file]
|
|
85
|
+
subprocess.call(command)
|
|
86
|
+
|
|
87
|
+
print("[BIASTOOLS] SAMPLE", bam_file, " as ", sample_id + ".baseline ...")
|
|
88
|
+
command = ["python3", path_module+"sample_baseline.py", "-b", bam_file, "-f", path_ref, "-o", prefix+".sample"]
|
|
89
|
+
print(' '.join(command))
|
|
90
|
+
subprocess.call(command)
|
|
91
|
+
|
|
92
|
+
if Range == None:
|
|
93
|
+
print("[BIASTOOLS] Process the whole bam file...")
|
|
94
|
+
target_bam = bam_file
|
|
95
|
+
else:
|
|
96
|
+
print("[BIASTOOLS] Extract reads from " + Range + "...")
|
|
97
|
+
target_bam = prefix + '.range.bam'
|
|
98
|
+
command = ["samtools", "view", " -h", bam_file, Range, "-o", target_bam, "-@", thread]
|
|
99
|
+
print(' '.join(command))
|
|
100
|
+
subprocess.call(command)
|
|
101
|
+
|
|
102
|
+
print("[BIASTOOLS] Format the mpileup...")
|
|
103
|
+
if os.path.exists(prefix+'.'+run_id+'.mpileup'):
|
|
104
|
+
print(prefix+'.'+run_id+'.mpileup already exist!')
|
|
105
|
+
else:
|
|
106
|
+
command = ["bcftools", "mpileup", "--count-orphans", "--annotate", "FORMAT/AD,FORMAT/DP", \
|
|
107
|
+
"-f", path_ref, \
|
|
108
|
+
"--min-BQ", "0", \
|
|
109
|
+
"--min-MQ", "0", \
|
|
110
|
+
"--threads", str(thread), target_bam, "-o", prefix+'.'+run_id+'.mpileup']
|
|
111
|
+
print(' '.join(command))
|
|
112
|
+
subprocess.call(command)
|
|
113
|
+
print("[BIASTOOLS] Scanning bias...")
|
|
114
|
+
if flag_wig:
|
|
115
|
+
command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \
|
|
116
|
+
"-wig", "-o", prefix+'.'+run_id+'.scanning']
|
|
117
|
+
else:
|
|
118
|
+
command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \
|
|
119
|
+
"-o", prefix+'.'+run_id+'.scanning']
|
|
120
|
+
print(' '.join(command))
|
|
121
|
+
subprocess.call(command)
|
|
122
|
+
|
|
123
|
+
if flag_compare_bam:
|
|
124
|
+
if os.path.exists(bam_file+'.bai'):
|
|
125
|
+
pass
|
|
126
|
+
else:
|
|
127
|
+
command = ["samtools", "index", bam_file]
|
|
128
|
+
subprocess.call(command)
|
|
129
|
+
if os.path.exists(bam_file2+'.bai'):
|
|
130
|
+
pass
|
|
131
|
+
else:
|
|
132
|
+
command = ["samtools", "index", bam_file2]
|
|
133
|
+
subprocess.call(command)
|
|
134
|
+
|
|
135
|
+
print("[Biastools] Generate common baseline...")
|
|
136
|
+
baseline = prefix+"."+run_id+".combine"
|
|
137
|
+
command = ["python3", path_module+"merge_baseline.py", "-b1", bam_file, "-b2", bam_file2, "-f", path_ref, "-o", baseline]
|
|
138
|
+
#print(' '.join(command))
|
|
139
|
+
subprocess.call(command)
|
|
140
|
+
command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file, "-b", baseline+".baseline", "-o", baseline+".1.scanning", ">", prefix+"."+run_id+".log"])
|
|
141
|
+
#print(command)
|
|
142
|
+
subprocess.call(command, shell=True)
|
|
143
|
+
command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file2, "-b", baseline+".baseline", "-o", baseline+".2.scanning", ">", prefix+"."+run_id+".log"])
|
|
144
|
+
#print(command)
|
|
145
|
+
subprocess.call(command, shell=True)
|
|
146
|
+
|
|
147
|
+
print("[Biastools] Compare two bam files with common baseline...")
|
|
148
|
+
command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, \
|
|
149
|
+
baseline+".1.scanning.bias.bed", \
|
|
150
|
+
baseline+".2.scanning.bias.bed", \
|
|
151
|
+
baseline+".2.scanning.lowRd.bed", \
|
|
152
|
+
path_module])
|
|
153
|
+
print(command)
|
|
154
|
+
subprocess.call(command, shell=True)
|
|
155
|
+
if flag_compare_rpt:
|
|
156
|
+
print("[Biastools] Compare two bed files...")
|
|
157
|
+
command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, bed_file1, bed_file2, lowRd_file2, path_module])
|
|
158
|
+
print(command)
|
|
159
|
+
subprocess.call(command, shell=True)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
if __name__ == "__main__":
|
|
165
|
+
main()
|