offtracker 2.7.10__zip → 2.10.1__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {offtracker-2.7.10 → offtracker-2.10.1}/PKG-INFO +64 -20
- offtracker-2.7.10/offtracker.egg-info/PKG-INFO → offtracker-2.10.1/README.md +221 -189
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_offplot.py +13 -2
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_sequence.py +113 -7
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/_version.py +9 -2
- offtracker-2.10.1/offtracker/snakefile/Snakefile_QC.smk +66 -0
- offtracker-2.10.1/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
- offtracker-2.7.10/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.1/offtracker/utility/1.1_bed2fr.py +6 -4
- offtracker-2.7.10/README.md → offtracker-2.10.1/offtracker.egg-info/PKG-INFO +233 -177
- offtracker-2.10.1/offtracker.egg-info/SOURCES.txt +28 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_analysis.py +10 -3
- offtracker-2.10.1/scripts/offtracker_candidates.py +318 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_config.py +28 -44
- offtracker-2.10.1/scripts/offtracker_qc.py +62 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/setup.py +5 -4
- offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker +0 -245
- offtracker-2.7.10/offtracker.egg-info/SOURCES.txt +0 -26
- offtracker-2.7.10/scripts/offtracker_candidates.py +0 -307
- {offtracker-2.7.10 → offtracker-2.10.1}/LICENSE.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/MANIFEST.in +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_offtracker.py +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/__init__.py +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/bedGraphToBigWig +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/hg38.chrom.sizes +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/mm10.chrom.sizes +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/dependency_links.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/requires.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/top_level.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_plot.py +0 -0
- {offtracker-2.7.10 → offtracker-2.10.1}/setup.cfg +0 -0
@@ -1,245 +0,0 @@
|
|
1
|
-
# 2023.08.11. adding an option for not normalizing the bw file
|
2
|
-
# 2024.01.23. add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
|
3
|
-
|
4
|
-
# Workflow-wide settings, all read from config.yaml.
configfile: "config.yaml"

# Thread count requested by every rule below.
_threads = config["thread"]
# Bin size is kept as a string because it is spliced into file names and shell commands.
BinSize = str(config["binsize"])
# Coerce to str so that a YAML boolean (normalize: true / false, loaded as a
# Python bool) is treated the same as the quoted strings "True" / "False",
# which is what the comparisons further down test against.
normalize = str(config["normalize"])
output_dir = config["output_dir"]
nametype = config["nametype"]
suffix = config["suffix"]
# nametype describes the read-2 file name tag; the read-1 tag is obtained by
# replacing every '2' in it with '1'.
name1 = nametype.replace('2','1') + '.' + suffix
name2 = nametype + '.' + suffix
|
14
|
-
|
15
|
-
import os
|
16
|
-
|
17
|
-
# Final targets of the workflow. The strand-split BED files are always
# requested; the remaining targets depend on whether normalisation is enabled.
if normalize not in ("True", "False"):
    raise ValueError('Please provide "True" or "False" for "--normalize" when running offtracker_config.py')

_final_patterns = ["{sample}.fw.bed", "{sample}.rv.bed"]
if normalize == "True":
    _final_patterns += [
        "{sample}.fw.scaled.bw",
        "{sample}.rv.scaled.bw",
        "{sample}." + BinSize + ".add.bdg",
    ]
else:
    _final_patterns += [
        "{sample}.fw.raw.bw",
        "{sample}.rv.raw.bw",
    ]

rule all:
    input:
        [
            expand( os.path.join(output_dir, _pattern), sample=config["sample"] )
            for _pattern in _final_patterns
        ]
|
34
|
-
|
35
|
-
|
36
|
-
# Align the paired-end reads of one sample with chromap and write a temporary BED file.
# The read files are located by appending the read-1 / read-2 name tags to the
# per-sample prefix stored in config["sample"].
rule chromap:
    input:
        R1=lambda wildcards: config["sample"][wildcards.sample] + name1,
        R2=lambda wildcards: config["sample"][wildcards.sample] + name2
    output:
        temp(os.path.join(output_dir,"{sample}.chromapx.bed"))
    params:
        index=config["index"],
        fasta=config["fasta"]
    threads:
        _threads
    shell:
        "chromap -l 3000 --low-mem --BED --remove-pcr-duplicates "
        "--min-read-length 10 --allocate-multi-mappings "
        "-x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}"
|
53
|
-
|
54
|
-
# Optional blacklist filtering: when a blacklist is configured, the aligned BED
# is filtered first and bed2fr reads the filtered file; otherwise bed2fr reads
# the chromap output directly.
if config["blacklist"] != 'none':
    _bed2fr_input = os.path.join(output_dir,"{sample}.filtered.bed")

    # Drop every aligned fragment that overlaps a blacklisted interval (bedtools intersect -v).
    rule remove_blacklist:
        input:
            os.path.join(output_dir,"{sample}.chromapx.bed")
        output:
            temp(os.path.join(output_dir,"{sample}.filtered.bed"))
        params:
            blacklist=config["blacklist"]
        threads:
            _threads
        shell:
            "bedtools intersect -a {input} -b {params.blacklist} -v > {output}"
else:
    _bed2fr_input = os.path.join(output_dir,"{sample}.chromapx.bed")

# Split the BED file into forward (fw) and reverse (rv) files using the bundled helper script.
# The script only receives the input path; the declared outputs are the files it is expected to create.
rule bed2fr:
    input:
        _bed2fr_input
    output:
        fw=os.path.join(output_dir,"{sample}.fw.bed"),
        rv=os.path.join(output_dir,"{sample}.rv.bed")
    params:
        dir_script=config["script_folder"]
    threads:
        _threads
    shell:
        "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
|
92
|
-
|
93
|
-
# Forward strand: compute genome coverage of the BED file as a temporary bedGraph.
rule bed2bdg_fw:
    input:
        os.path.join(output_dir,"{sample}.fw.bed")
    output:
        temp(os.path.join(output_dir,"{sample}.fw.bdg"))
    params:
        genome=config["genomelen"]
    threads:
        _threads
    shell:
        "bedtools genomecov -bg -i {input} -g {params.genome} > {output}"
|
104
|
-
|
105
|
-
# Reverse strand: compute genome coverage of the BED file as a temporary bedGraph.
rule bed2bdg_rv:
    input:
        os.path.join(output_dir,"{sample}.rv.bed")
    output:
        temp(os.path.join(output_dir,"{sample}.rv.bdg"))
    params:
        genome=config["genomelen"]
    threads:
        _threads
    shell:
        "bedtools genomecov -bg -i {input} -g {params.genome} > {output}"
|
116
|
-
|
117
|
-
# Forward strand: sort the coverage bedGraph with bedtools sort.
rule bdg_sort_fw:
    input:
        os.path.join(output_dir,"{sample}.fw.bdg")
    output:
        temp(os.path.join(output_dir,"{sample}.fw.sorted.bdg"))
    threads:
        _threads
    shell:
        "bedtools sort -i {input} > {output}"
|
126
|
-
|
127
|
-
# Reverse strand: sort the coverage bedGraph with bedtools sort.
rule bdg_sort_rv:
    input:
        os.path.join(output_dir,"{sample}.rv.bdg")
    output:
        temp(os.path.join(output_dir,"{sample}.rv.sorted.bdg"))
    threads:
        _threads
    shell:
        "bedtools sort -i {input} > {output}"
|
136
|
-
|
137
|
-
# bedGraph -> bigWig conversion.
# With normalisation, each strand is scaled by the bundled script, converted to
# "*.scaled.bw", and the two strands are summed per bin by bigwigCompare.
# Without normalisation, the reverse-strand values are negated and both strands
# are converted to "*.raw.bw".
if normalize == "True":
    _bw_tag = "scaled"
    _fw_bdg_for_bw = os.path.join(output_dir,"{sample}.fw.scaled.bdg")
    _rv_bdg_for_bw = os.path.join(output_dir,"{sample}.rv.scaled.bdg")

    # The script receives only the sorted bedGraph and the strand BED;
    # the declared output is the file it is expected to create.
    rule bdg_normalize_fw:
        input:
            bdg=os.path.join(output_dir,"{sample}.fw.sorted.bdg"),
            bed=os.path.join(output_dir,"{sample}.fw.bed")
        output:
            temp(os.path.join(output_dir,"{sample}.fw.scaled.bdg"))
        params:
            dir_script=config["script_folder"]
        threads:
            _threads
        shell:
            "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"

    rule bdg_normalize_rv:
        input:
            bdg=os.path.join(output_dir,"{sample}.rv.sorted.bdg"),
            bed=os.path.join(output_dir,"{sample}.rv.bed")
        output:
            temp(os.path.join(output_dir,"{sample}.rv.scaled.bdg"))
        params:
            dir_script=config["script_folder"]
        threads:
            _threads
        shell:
            "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"

    # Sum forward and reverse signal per bin; --fixedStep keeps neighbouring
    # bins with equal values from being merged.
    rule bwAdd:
        input:
            fw=os.path.join(output_dir,"{sample}.fw.scaled.bw"),
            rv=os.path.join(output_dir,"{sample}.rv.scaled.bw")
        output:
            os.path.join(output_dir,"{sample}." + BinSize + ".add.bdg")
        threads:
            _threads
        shell:
            "bigwigCompare --binSize {BinSize} -p {threads} --verbose -o {output} "
            "--outFileFormat bedgraph --fixedStep "
            "--bigwig1 {input.fw} "
            "--bigwig2 {input.rv} "
            "--operation add"
else:
    _bw_tag = "raw"
    _fw_bdg_for_bw = os.path.join(output_dir,"{sample}.fw.sorted.bdg")
    _rv_bdg_for_bw = os.path.join(output_dir,"{sample}.rv.sorted_r.bdg")

    # Negate column 4 (the signal value) of the reverse-strand bedGraph.
    rule bdg_reverse_rv:
        input:
            os.path.join(output_dir,"{sample}.rv.sorted.bdg")
        output:
            temp(os.path.join(output_dir,"{sample}.rv.sorted_r.bdg"))
        threads:
            _threads
        shell:
            "awk -F '\t' -v OFS='\t' '{{$4=-$4; print}}' {input} > {output}"

# Convert the per-strand bedGraph selected above into a bigWig file with the
# bedGraphToBigWig binary located in the script folder.
rule bdg2bw_fw:
    input:
        _fw_bdg_for_bw
    output:
        os.path.join(output_dir,"{sample}.fw." + _bw_tag + ".bw")
    params:
        gl=config["genomelen"],
        dir_script=config["script_folder"]
    threads:
        _threads
    shell:
        "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"

rule bdg2bw_rv:
    input:
        _rv_bdg_for_bw
    output:
        os.path.join(output_dir,"{sample}.rv." + _bw_tag + ".bw")
    params:
        gl=config["genomelen"],
        dir_script=config["script_folder"]
    threads:
        _threads
    shell:
        "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
@@ -1,26 +0,0 @@
|
|
1
|
-
LICENSE.txt
|
2
|
-
MANIFEST.in
|
3
|
-
README.md
|
4
|
-
setup.py
|
5
|
-
offtracker/X_offplot.py
|
6
|
-
offtracker/X_offtracker.py
|
7
|
-
offtracker/X_sequence.py
|
8
|
-
offtracker/__init__.py
|
9
|
-
offtracker/_version.py
|
10
|
-
offtracker.egg-info/PKG-INFO
|
11
|
-
offtracker.egg-info/SOURCES.txt
|
12
|
-
offtracker.egg-info/dependency_links.txt
|
13
|
-
offtracker.egg-info/requires.txt
|
14
|
-
offtracker.egg-info/top_level.txt
|
15
|
-
offtracker/mapping/1.1_bed2fr_v4.5.py
|
16
|
-
offtracker/mapping/1.3_bdg_normalize_v4.0.py
|
17
|
-
offtracker/mapping/Snakefile_offtracker
|
18
|
-
offtracker/mapping/bedGraphToBigWig
|
19
|
-
offtracker/mapping/hg38.chrom.sizes
|
20
|
-
offtracker/mapping/mm10.chrom.sizes
|
21
|
-
offtracker/mapping/offtracker_blacklist_hg38.merged.bed
|
22
|
-
offtracker/mapping/offtracker_blacklist_mm10.merged.bed
|
23
|
-
scripts/offtracker_analysis.py
|
24
|
-
scripts/offtracker_candidates.py
|
25
|
-
scripts/offtracker_config.py
|
26
|
-
scripts/offtracker_plot.py
|
@@ -1,307 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
# 2023.10.27. v2.0: 2.0以target_location midpoint为中心,因此取消 pct 计算
|
5
|
-
# 2023.12.06. v2.1: 2.1增加 cleavage_site 推测, 修正 deletion 错位, 以 cleavage_site 为中心
|
6
|
-
import os,sys,re,time
|
7
|
-
from itertools import product
|
8
|
-
|
9
|
-
# Abort early with an explicit message when the interpreter is older than Python 3.
if sys.version_info < (3, 0):
    import platform
    _running_version = platform.python_version()
    raise Exception('python3 is needed, while running {} now'.format(_running_version))
|
12
|
-
|
13
|
-
import offtracker
|
14
|
-
import offtracker.X_sequence as xseq
|
15
|
-
# Absolute directory of the installed offtracker package, and its 'mapping'
# sub-folder, which is where main() looks for the bundled *.chrom.sizes files.
script_dir = os.path.dirname(os.path.abspath(offtracker.__file__))
script_folder = os.path.join(script_dir, 'mapping')
|
17
|
-
|
18
|
-
import argparse
|
19
|
-
import pandas as pd
|
20
|
-
import pybedtools
|
21
|
-
import multiprocessing as mp
|
22
|
-
from Bio.Blast.Applications import NcbiblastnCommandline
|
23
|
-
|
24
|
-
def _run_blast(dir_sgRNA_fasta, dir_sgRNA_blast, blast_db, n_threads, quick_mode, sgRNA_name):
    """Run ``blastn-short`` for the sgRNA+PAM queries and write tabular (outfmt 6) hits."""
    if quick_mode:
        print('Using quick mode for BLAST')
    # The two modes differ only in the seed word size (5 in quick mode, 4 otherwise).
    # NOTE(review): Bio.Blast.Applications is deprecated in recent Biopython
    # releases; confirm the pinned Biopython version still ships it.
    blastx_cline = NcbiblastnCommandline(
        query=dir_sgRNA_fasta, task='blastn-short', out=dir_sgRNA_blast,
        db=blast_db, evalue=10000, outfmt=6, num_threads=n_threads,
        gapopen=4, gapextend=2, reward=2, word_size=5 if quick_mode else 4,
        dust='no', soft_masking=False)
    print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
    blastx_cline()
    print('BLAST finished.')


def _blast_hits_to_bed(dir_sgRNA_blast, dir_sgRNA_bed, dir_chrom_sizes, len_sgRNA):
    """Convert the tabular BLAST output into a sorted, de-duplicated BED file."""
    blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t', header=None)
    blast_regions.columns = ['query acc.', 'chr', '% identity', 'alignment length', 'mismatches',
                             'gap opens', 'q. start', 'q. end', 'st', 'ed', 'evalue', 'bit score']
    # .copy() so that the column assignment below acts on an independent frame
    blast_regions = blast_regions[blast_regions.evalue < 10000].copy()

    # Hits whose start is larger than their end are flagged as reverse-strand
    # hits; their coordinates are swapped so that st <= ed for every row.
    blast_regions['reverse'] = (blast_regions['st'] > blast_regions['ed']).astype(int)
    forward_hits = blast_regions[blast_regions['reverse'] == 0]
    reverse_hits = blast_regions[blast_regions['reverse'] == 1].rename(columns={'st': 'ed', 'ed': 'st'})
    blast_regions = pd.concat([forward_hits, reverse_hits])

    # Sort by e-value and keep the first hit of every distinct location
    blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
    # astype(str) also copes with contig names that pandas parsed as numbers
    blast_regions['location'] = (blast_regions['chr'].astype(str) + ':'
                                 + blast_regions['st'].astype(str) + '-'
                                 + blast_regions['ed'].astype(str))
    blast_regions = blast_regions.drop_duplicates(subset='location')

    # Filter on alignment length: allow at most 8 bases fewer than the sgRNA length
    min_len = len_sgRNA - 8
    blast_regions = blast_regions[blast_regions['alignment length'] >= min_len].reset_index(drop=True)

    # Write the BED used for the alignment-score step, then sort it (without merging)
    xseq.write_bed(blast_regions[['chr', 'st', 'ed']], dir_sgRNA_bed)
    pybedtools.BedTool(dir_sgRNA_bed).sort(g=dir_chrom_sizes).saveas(dir_sgRNA_bed)


def _read_fasta(path):
    """Read a FASTA file into ``{header: sequence}``; multi-line sequences are joined."""
    records = {}
    header = None
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.strip()  # drop the trailing newline
            if not line:
                continue  # tolerate blank lines
            if line.startswith('>'):
                header = line[1:]
            elif header is None:
                raise ValueError(f'{path}: sequence data found before the first FASTA header')
            else:
                records[header] = records.get(header, '') + line
    return records


def _pick_best_hit(row):
    """Choose the forward or reverse alignment of one candidate row.

    Returns a dict with the deletion/insertion/mismatch fields of the chosen
    strand, the ``GG`` check label, the chosen target sequence and location,
    and the inferred cleavage site.
    """
    strand = row['best_strand']
    if strand == '+':
        prefix, gg_label = 'fw', ('OK' if row['fw_target'][-2:] == 'GG' else 'NO')
    elif strand == '-':
        prefix, gg_label = 'rv', ('OK' if row['rv_target'][-2:] == 'GG' else 'NO')
    # Equal scores: prefer the forward strand if it ends with GG, otherwise try
    # the reverse complement, otherwise fall back to the forward strand.
    elif row['fw_target'][-2:] == 'GG':
        prefix, gg_label = 'fw', 'OK_same_score'
    elif row['rv_target'][-2:] == 'GG':
        prefix, gg_label = 'rv', 'OK_same_score'
    else:
        prefix, gg_label = 'fw', 'NO_same_score'

    location = row[f'{prefix}_location']
    if prefix == 'fw':
        # forward strand: 6 bases before the end coordinate of 'chr:st-ed'
        cleavage_site = int(location.split('-')[1]) - 6
    else:
        # reverse strand: 5 bases after the start coordinate of 'chr:st-ed'
        cleavage_site = int(location.split('-')[0].split(':')[1]) + 5

    return {
        'deletion': row[f'{prefix}_deletion'],
        'insertion': row[f'{prefix}_insertion'],
        'mismatch': row[f'{prefix}_mismatch'],
        'GG': gg_label,
        'best_target': row[f'{prefix}_target'],
        'target_location': location,
        'cleavage_site': cleavage_site,
    }


def _score_candidates(dir_sgRNA_bed, dir_ref_fa, dir_output, dir_df_candidate,
                      sgRNA_seq, PAM, n_threads, half_width):
    """Align sgRNA+PAM against the genomic window around every BLAST hit and write the candidate table."""
    frag_len = half_width * 2

    # Read the BLAST BED and keep chr0..chr22, chrX and chrY only
    common_chr = [f'chr{i}' for i in range(23)] + ['chrX', 'chrY']
    bed_short = xseq.X_readbed(dir_sgRNA_bed)
    bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
    # Re-centre every hit on its midpoint and extend half_width to both sides
    bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed']) / 2).astype(int)
    bed_short['st'] = bed_short['midpoint'] - half_width
    bed_short['ed'] = bed_short['midpoint'] + half_width
    bed_short.loc[bed_short['st'] < 0, 'st'] = 0
    bed_short = bed_short.drop_duplicates()

    temp_bed = os.path.join(dir_output, 'temp.bed')
    try:
        # Fetch the genomic sequence of every window
        xseq.write_bed(bed_short.iloc[:, :3], temp_bed)
        # The reference path is handed to bedtools as given; routing it through
        # pybedtools.example_filename() would resolve it against pybedtools'
        # bundled example-data directory.
        regions = pybedtools.BedTool(temp_bed).sequence(fi=dir_ref_fa)
        fasta = _read_fasta(regions.seqfn)
        # Note from the original author: for a location chrA:X-Y returned by
        # pybedtools, X is shifted 1 bp to the left.

        # Local-alignment scoring: 2 for a match (N in the first position
        # matches anything), 0.01 for every mismatch.
        DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):0.01,
                      ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):0.01,
                      ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):0.01,
                      ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):0.01,
                      ('N','N'): 2, ('N','C'):2, ('N','A'): 2, ('N','G'): 2, ('N','T'): 2}
        mismatch_score = 0.01
        # Append the PAM and prepare the reverse complement for the other strand
        sgRNA_PAM_fw = sgRNA_seq + PAM
        sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)

        list_args_fw = []
        list_args_rv = []
        for a_key, raw_seq in fasta.items():
            # NOTE(review): every character other than upper-case A/T/C/G becomes
            # N, which includes lower-case (soft-masked) bases -- confirm that
            # this is intended for soft-masked reference genomes.
            seq = re.sub('[^ATCG]', 'N', raw_seq)
            list_args_fw.append([a_key, sgRNA_PAM_fw, seq, frag_len, DNA_matrix, mismatch_score])
            list_args_rv.append([a_key, sgRNA_PAM_rv, seq, frag_len, DNA_matrix, mismatch_score])

        # One worker pool serves both strands
        with mp.Pool(n_threads) as pool:
            st = time.time()
            list_align_forward = pool.starmap(xseq.sgRNA_alignment, list_args_fw)
            print('align_forward:{:.2f}'.format(time.time() - st))
            st = time.time()
            list_align_reverse = pool.starmap(xseq.sgRNA_alignment, list_args_rv)
            print('align_reverse:{:.2f}'.format(time.time() - st))

        df_align_forward = pd.DataFrame(list_align_forward, columns=['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
        df_align_reverse = pd.DataFrame(list_align_reverse, columns=['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
        df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
        df_candidate = pd.concat([df_align_forward, df_align_reverse], axis=1)
        df_candidate['location'] = list(fasta)
        df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
        df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
        df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
        df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'

        # GG check and cleavage-site inference, one decision per candidate row.
        # Columns are added in this fixed order because it defines the CSV layout.
        best_hits = [_pick_best_hit(row) for _, row in df_candidate.iterrows()]
        for column in ('deletion', 'insertion', 'mismatch', 'GG',
                       'best_target', 'target_location', 'cleavage_site'):
            df_candidate[column] = [hit[column] for hit in best_hits]

        # Final layout: BED-style columns first, IDs based on the cleavage site
        df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
        df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate], axis=1)
        df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')

        df_candidate.to_csv(dir_df_candidate)
    finally:
        # Do not leave the scratch BED behind, even when a step above fails
        if os.path.exists(temp_bed):
            os.remove(temp_bed)


def main():
    """Generate candidate off-target regions for one sgRNA.

    Command-line workflow:
      1. write every sgRNA+PAM combination to ``<name>_PAM.fasta``;
      2. BLAST them against ``--blastdb`` (skipped when ``<name>_PAM.blast`` exists);
      3. convert the hits into the sorted BED file ``<name>_PAM.bed``;
      4. align sgRNA+PAM on both strands against a +/-100 bp window around each
         hit and write ``df_candidate_<name>.csv`` (skipped when it exists).

    Returns:
        The string ``'Done!'``.
    """
    parser = argparse.ArgumentParser()
    parser.description='Generate candidate regions by sgRNA sequence'
    parser.add_argument('--sgrna'       , type=str, required=True, help='sgRNA sequence without PAM' )
    parser.add_argument('--pam'         , type=str, required=True, help='The protospacer adjacent motif' )
    parser.add_argument('--name'        , type=str, required=True, help='custom name of the sgRNA' )
    parser.add_argument('-r','--ref'    , type=str, required=True, help='The fasta file of reference genome')
    parser.add_argument('-b','--blastdb', type=str, required=True, help='blast database')
    parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
    parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
    parser.add_argument('-t','--thread' , type=int, default=4,     help='Number of threads for parallel computing')
    parser.add_argument('--quick_mode'  , action='store_true', help='BLAST faster but less candidates.')

    args = parser.parse_args()

    # Built-in genomes use the bundled chromosome-size files; anything else is
    # taken as a path to a user-supplied file.
    if args.genome in ('hg38', 'mm10'):
        dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
    else:
        dir_chrom_sizes = args.genome

    sgRNA_name = args.name
    sgRNA_seq = args.sgrna.upper()
    PAM = args.pam.upper()
    n_threads = args.thread
    dir_output = args.outdir
    os.makedirs(dir_output, exist_ok=True)
    dir_ref_fa = args.ref
    blast_db = args.blastdb
    quick_mode = args.quick_mode

    # parameters for alignment
    half_width = 100

    dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
    dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
    dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
    dir_sgRNA_bed = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')

    # One query per sequence returned by xseq.possible_seq(PAM), named seq1..seqN
    possible_sgRNA_PAM = [sgRNA_seq + a_pam for a_pam in xseq.possible_seq(PAM)]
    df_sgRNA_PAM = pd.DataFrame({
        'ID': [f'seq{i}' for i in range(1, len(possible_sgRNA_PAM) + 1)],
        'sequence': possible_sgRNA_PAM,
    })
    xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)

    #########
    # BLAST #
    #########
    if os.path.isfile(dir_sgRNA_blast):
        print(f'{dir_sgRNA_blast} exists, skipped.')
    else:
        _run_blast(dir_sgRNA_fasta, dir_sgRNA_blast, blast_db, n_threads, quick_mode, sgRNA_name)

    ##############
    # Output bed #
    ##############
    _blast_hits_to_bed(dir_sgRNA_blast, dir_sgRNA_bed, dir_chrom_sizes, len(sgRNA_seq))
    print(f'Output {sgRNA_name}_PAM.bed')

    ###################
    # alignment score #
    ###################
    if os.path.isfile(dir_df_candidate):
        print(f'{dir_df_candidate} exists, skipped.')
    else:
        _score_candidates(dir_sgRNA_bed, dir_ref_fa, dir_output, dir_df_candidate,
                          sgRNA_seq, PAM, n_threads, half_width)
        print(f'Output df_candidate_{sgRNA_name}.csv')

    return 'Done!'
|
300
|
-
|
301
|
-
|
302
|
-
# Script entry point: run the CLI and echo its status string.
if __name__ == '__main__':
    print(main())
|
305
|
-
|
306
|
-
|
307
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/bedGraphToBigWig
RENAMED
File without changes
|
{offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/hg38.chrom.sizes
RENAMED
File without changes
|
{offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/mm10.chrom.sizes
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|