offtracker 2.7.10__zip → 2.10.0__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {offtracker-2.7.10/offtracker.egg-info → offtracker-2.10.0}/PKG-INFO +62 -18
- {offtracker-2.7.10 → offtracker-2.10.0}/README.md +62 -18
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offplot.py +13 -2
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/_version.py +8 -2
- offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
- offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
- offtracker-2.7.10/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
- {offtracker-2.7.10 → offtracker-2.10.0/offtracker.egg-info}/PKG-INFO +62 -18
- offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_analysis.py +10 -3
- offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
- offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/setup.py +5 -4
- offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker +0 -245
- offtracker-2.7.10/offtracker.egg-info/SOURCES.txt +0 -26
- offtracker-2.7.10/scripts/offtracker_candidates.py +0 -307
- {offtracker-2.7.10 → offtracker-2.10.0}/LICENSE.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/MANIFEST.in +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
- {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_plot.py +0 -0
- {offtracker-2.7.10 → offtracker-2.10.0}/setup.cfg +0 -0
@@ -0,0 +1,318 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# 2023.10.27. v2.0: 2.0以target_location midpoint为中心,因此取消 pct 计算
|
5
|
+
# 2023.12.06. v2.1: 2.1增加 cleavage_site 推测, 修正 deletion 错位, 以 cleavage_site 为中心
|
6
|
+
# 2025.04.25. 修正大小写问题
|
7
|
+
# 2025.06.11. 调整跳过已存在的candidates的代码顺序
|
8
|
+
|
9
|
+
import os,sys,re,time
|
10
|
+
from itertools import product, permutations
|
11
|
+
|
12
|
+
if sys.version_info < (3,0):
|
13
|
+
import platform
|
14
|
+
raise Exception(f'python3 is needed, while running {platform.python_version()} now')
|
15
|
+
|
16
|
+
import offtracker
|
17
|
+
import offtracker.X_sequence as xseq
|
18
|
+
# Absolute path of the installed ``offtracker`` package directory.
script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
# Folder holding the bundled resources (e.g. ``hg38.chrom.sizes``).
# The resources were moved from ``mapping`` to ``utility`` in this release, so
# ``utility`` is tried first; ``mapping`` is kept as a fallback for installs
# that still use the old layout.
script_folder = os.path.join(script_dir, 'utility')
if not os.path.isdir(script_folder):
    script_folder = os.path.join(script_dir, 'mapping')
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
import pandas as pd
|
23
|
+
import pybedtools
|
24
|
+
import multiprocessing as mp
|
25
|
+
from Bio.Blast.Applications import NcbiblastnCommandline
|
26
|
+
|
27
|
+
def _read_fasta(path):
    """Read a FASTA file into a ``{header: sequence}`` dict.

    Headers are stored without the leading ``>``. Sequence lines that belong
    to the same header are concatenated. Blank lines are ignored, and sequence
    lines that appear before any header are dropped.
    """
    records = {}
    header = None
    with open(path) as handle:
        for line in handle:
            line = line.strip()  # drop the trailing newline
            if not line:
                continue
            if line.startswith('>'):
                header = line[1:]
            elif header is not None:
                records[header] = records.get(header, '') + line
    return records


def _select_best_alignment(row):
    """Pick the forward or reverse alignment of one candidate row.

    ``row`` is any mapping with ``best_strand`` and the ``fw_*`` / ``rv_*``
    fields (target, location, deletion, insertion, mismatch). Locations are
    parsed as ``<chr>:<start>-<end>`` strings.

    Selection rule:
    * ``best_strand == '+'`` -> forward, ``'-'`` -> reverse;
    * otherwise (equal scores) prefer the strand whose target ends with
      ``GG``, trying forward first, and fall back to forward.

    Returns a dict keyed by the output column names.
    """
    # NOTE(review): the 'GG' suffix check and the -6 / +5 cleavage offsets look
    # specific to a 3-nt downstream NGG PAM (Cas9) -- confirm before relying on
    # these columns with --pam_location upstream.
    strand = row['best_strand']
    if strand == '+':
        side = 'fw'
        gg_flag = 'OK' if row['fw_target'][-2:] == 'GG' else 'NO'
    elif strand == '-':
        side = 'rv'
        gg_flag = 'OK' if row['rv_target'][-2:] == 'GG' else 'NO'
    elif row['fw_target'][-2:] == 'GG':
        side, gg_flag = 'fw', 'OK_same_score'
    elif row['rv_target'][-2:] == 'GG':
        # forward has no GG, so look at the reverse complement
        side, gg_flag = 'rv', 'OK_same_score'
    else:
        side, gg_flag = 'fw', 'NO_same_score'

    location = row[f'{side}_location']
    if side == 'fw':
        cleavage_site = int(location.split('-')[1]) - 6
    else:
        cleavage_site = int(location.split('-')[0].split(':')[1]) + 5

    return {
        'deletion': row[f'{side}_deletion'],
        'insertion': row[f'{side}_insertion'],
        'mismatch': row[f'{side}_mismatch'],
        'GG': gg_flag,
        'best_target': row[f'{side}_target'],
        'target_location': location,
        'cleavage_site': cleavage_site,
    }


def main():
    """Generate candidate off-target regions for one sgRNA.

    Steps: write sgRNA+PAM query sequences, BLAST them against ``--blastdb``,
    turn the hits into fixed-width windows, extract the window sequences from
    ``--ref``, align the sgRNA+PAM to both strands of each window and write
    ``df_candidate_<name>.csv`` into ``--outdir``.

    Returns ``'skipped'`` when the output csv already exists, else ``'Done!'``.
    Raises ``Exception`` when ``--pam_location`` is neither ``upstream`` nor
    ``downstream``.
    """
    parser = argparse.ArgumentParser()
    parser.description='Generate candidate regions by sgRNA sequence'
    parser.add_argument('--sgrna'       , type=str, required=True,        help='One sgRNA sequence without PAM' )
    parser.add_argument('--pam'         , type=str, required=True,        help='The protospacer adjacent motif' )
    parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
    parser.add_argument('--name'        , type=str, required=True,        help='custom name of the sgRNA' )
    parser.add_argument('-r','--ref'    , type=str, required=True,        help='The fasta file of reference genome')
    parser.add_argument('-b','--blastdb', type=str, required=True,        help='blast database')
    parser.add_argument('-o','--outdir' , type=str, required=True,        help='The output folder')
    parser.add_argument('-g','--genome' , type=str, default='hg38',       help='File of chromosome sizes, or "hg38", "mm10" ')
    parser.add_argument('-t','--thread' , type=int, default=4,            help='Number of threads for parallel computing')
    parser.add_argument('--quick_mode'  , action='store_true',            help='BLAST faster but less candidates.')

    args = parser.parse_args()

    # Built-in genomes use the chrom.sizes file shipped with the package;
    # anything else is taken as a path to a chrom.sizes file.
    if args.genome in ('hg38', 'mm10'):
        dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
    else:
        dir_chrom_sizes = args.genome

    sgRNA_name = args.name
    sgRNA_seq  = args.sgrna
    PAM        = args.pam
    PAM_loc    = args.pam_location.lower()
    n_threads  = args.thread
    dir_output = args.outdir
    os.makedirs(dir_output, exist_ok=True)
    dir_ref_fa = args.ref
    blast_db   = args.blastdb
    quick_mode = args.quick_mode

    # parameters for alignment
    half_width = 100
    frag_len = half_width*2

    # Skip everything when the final table is already there.
    dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
    if os.path.isfile(dir_df_candidate):
        print(f'{dir_df_candidate} exists, skipped.')
        return 'skipped'

    sgRNA_seq = sgRNA_seq.upper()
    PAM = PAM.upper()
    dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
    dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
    dir_sgRNA_bed   = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')

    # One query per sequence returned by xseq.possible_seq(PAM).
    if PAM_loc == 'downstream':
        possible_sgRNA_PAM = list(product([sgRNA_seq], xseq.possible_seq(PAM)))
    elif PAM_loc == 'upstream':
        possible_sgRNA_PAM = list(product(xseq.possible_seq(PAM), [sgRNA_seq]))
    else:
        raise Exception(f'PAM_location should be "upstream" or "downstream", while {PAM_loc} is given.')
    possible_sgRNA_PAM = [''.join(combination) for combination in possible_sgRNA_PAM]
    n_seq = len(possible_sgRNA_PAM)

    ID = pd.Series(['seq']*n_seq) + pd.Series(range(1, n_seq+1)).astype(str)
    df_sgRNA_PAM = pd.DataFrame({'ID':ID, 'sequence':possible_sgRNA_PAM})
    xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)

    #########
    # BLAST #
    #########
    if os.path.isfile(dir_sgRNA_blast):
        print(f'{dir_sgRNA_blast} exists, skipped.')
    else:
        if quick_mode:
            print('Using quick mode for BLAST')
        # Quick mode differs from the default only in the seed word size (5 vs 4).
        word_size = 5 if quick_mode else 4
        blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short', out=dir_sgRNA_blast,
                                             db=blast_db, evalue=10000, outfmt=6, num_threads=n_threads,
                                             gapopen=4, gapextend=2, reward=2, word_size=word_size,
                                             dust='no', soft_masking=False)
        print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
        blastx_cline()
        print('BLAST finished.')

    ##############
    # Output bed #
    ##############

    blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t', header=None)
    blast_regions.columns = ['query acc.','chr','% identity','alignment length','mismatches','gap opens','q. start','q. end','st','ed','evalue','bit score']
    # .copy() so the column assignment below does not write into a view
    blast_regions = blast_regions[blast_regions.evalue<10000].copy()

    # reverse strand: hits with st > ed get their coordinates swapped
    blast_regions['reverse'] = (blast_regions['st']>blast_regions['ed']).astype(int)
    blast_regions_f = blast_regions[blast_regions.reverse==0].copy()
    blast_regions_r = blast_regions[blast_regions.reverse==1].copy()
    temp = blast_regions_r['st'].copy()
    blast_regions_r['st'] = blast_regions_r['ed']
    blast_regions_r['ed'] = temp
    blast_regions = pd.concat([blast_regions_f, blast_regions_r])
    # sort and add location
    blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
    blast_regions['location'] = blast_regions['chr'].str[:] + ':' + blast_regions['st'].astype(str).str[:] + '-' + blast_regions['ed'].astype(str).str[:]
    blast_regions = blast_regions.drop_duplicates(subset='location').copy()

    # filter by alignment length
    len_sgRNA = len(sgRNA_seq)
    min_len = len_sgRNA-8
    blast_regions = blast_regions[blast_regions['alignment length']>=min_len].copy().reset_index(drop=True)
    blast_regions = blast_regions.reindex(columns = ['chr', 'st', 'ed' , 'query acc.', '% identity', 'alignment length', 'mismatches',
                                                     'gap opens', 'q. start', 'q. end', 'evalue', 'bit score', 'reverse', 'location'] )

    # write the bed used by the alignment-score step below
    blast_regions_bed = blast_regions[['chr','st','ed']]
    xseq.write_bed(blast_regions_bed, dir_sgRNA_bed)
    # sort the bed without merging intervals
    a = pybedtools.BedTool(dir_sgRNA_bed)
    a.sort(g=dir_chrom_sizes).saveas( dir_sgRNA_bed )
    print(f'Output {sgRNA_name}_PAM.bed')

    ###################
    # alignment score #
    ###################

    #########
    # read the blast bed
    #########
    # chr0..chr22 plus chrX / chrY; every other contig is discarded
    common_chr = pd.Series(['chr']*23).str[:] + pd.Series(range(23)).astype(str).str[:]
    common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY'])]).to_numpy()

    bed_short = xseq.X_readbed(dir_sgRNA_bed)
    bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
    bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed'])/2).astype(int)
    bed_short['st'] = bed_short['midpoint'] - half_width
    bed_short['ed'] = bed_short['midpoint'] + half_width
    bed_short.loc[bed_short['st']<0,'st'] = 0
    bed_short = bed_short.drop_duplicates()

    #########
    # extract the genomic sequence of each window (midpoint +/- half_width)
    #########

    # The scratch bed is named after the sgRNA so that parallel runs sharing
    # one output folder do not overwrite each other's file.
    temp_bed = os.path.join(dir_output, f'temp_{sgRNA_name}.bed')
    xseq.write_bed(bed_short.iloc[:,:3], temp_bed)
    try:
        # Pass the reference path straight to bedtools. example_filename() is
        # meant for pybedtools' bundled test data, not for user files.
        extracted = pybedtools.BedTool(temp_bed).sequence(fi=dir_ref_fa)
        fasta = _read_fasta(extracted.seqfn)
    finally:
        if os.path.exists(temp_bed):
            os.remove(temp_bed)

    # Original author's note: in the chrA:X-Y headers produced by pybedtools,
    # X is shifted 1 bp to the left.

    #########
    # local alignment
    #########
    # build the DNA substitution matrix
    mismatch_score = 0.01
    base_codes = list(xseq.ambiguous_nt.keys())
    all_base_pairs = list(permutations(base_codes,2)) + [(x,x) for x in base_codes]
    DNA_matrix = {x : xseq.get_base_score(*x, mismatch_score=mismatch_score) for x in all_base_pairs}
    # add the PAM
    if PAM_loc == 'downstream':
        sgRNA_PAM_fw = sgRNA_seq + PAM
    else:
        sgRNA_PAM_fw = PAM + sgRNA_seq

    sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)

    list_args_fw = []
    list_args_rv = []
    for a_key, a_seq in fasta.items():
        # 2025.04.25 case fix: upper-case, then mask everything that is not A/T/C/G
        a_seq = re.sub('[^ATCG]','N',a_seq.upper())
        list_args_fw.append( [a_key, sgRNA_PAM_fw, a_seq, frag_len, DNA_matrix, mismatch_score] )
        list_args_rv.append( [a_key, sgRNA_PAM_rv, a_seq, frag_len, DNA_matrix, mismatch_score] )

    # One worker pool serves both strands.
    with mp.Pool(n_threads) as p:
        st = time.time()
        list_align_forward = p.starmap(xseq.sgRNA_alignment, list_args_fw)
        ed = time.time()
        print('align_forward:{:.2f}'.format(ed-st))
        st = time.time()
        list_align_reverse = p.starmap(xseq.sgRNA_alignment, list_args_rv)
        ed = time.time()
        print('align_reverse:{:.2f}'.format(ed-st))

    df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
    df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
    df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
    df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
    df_candidate['location'] = list(fasta)
    df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
    df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
    df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
    df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand'] = 'equal_score'

    # GG check and cleavage-site inference (added 2023.12.05)
    best_choices = [_select_best_alignment(row) for _, row in df_candidate.iterrows()]
    # write the selected values into df_candidate, keeping the column order
    for column in ('deletion', 'insertion', 'mismatch', 'GG',
                   'best_target', 'target_location', 'cleavage_site'):
        df_candidate[column] = [choice[column] for choice in best_choices]

    # v2.0 output format
    df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
    df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate],axis=1)
    df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')

    df_candidate.to_csv(dir_df_candidate)
    print(f'Output df_candidate_{sgRNA_name}.csv')

    return 'Done!'


if __name__ == '__main__' :
    result = main()
    print(result)
|
316
|
+
|
317
|
+
|
318
|
+
|
@@ -1,20 +1,22 @@
|
|
1
1
|
#!/usr/bin/env python
|
2
2
|
# -*- coding: utf-8 -*-
|
3
3
|
|
4
|
-
# 2023.08.11.
|
4
|
+
# 2023.08.11. adding an option for not normalizing the bw file
|
5
|
+
# 2025.05.22. refine the structure
|
6
|
+
# 2025.06.05. 增加 ignore_chr 选项,默认只取 common chromosomes,用于 1.1_bed2fr.py
|
5
7
|
|
6
8
|
import argparse
|
7
9
|
import os, glob, yaml
|
8
10
|
import pandas as pd
|
9
11
|
import shutil, re
|
10
12
|
import offtracker
|
13
|
+
import offtracker.X_sequence as xseq
|
11
14
|
script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
|
12
|
-
|
13
|
-
os.chmod( os.path.join(script_folder, 'bedGraphToBigWig'), 0o755)
|
15
|
+
utility_dir = os.path.join(script_dir, 'utility')
|
14
16
|
|
15
17
|
###
|
16
18
|
parser = argparse.ArgumentParser()
|
17
|
-
parser.description='Mapping fastq files of
|
19
|
+
parser.description='Mapping fastq files of Tracking-seq.'
|
18
20
|
parser.add_argument('-f','--folder', type=str, required=True, help='Directory of the input folder' )
|
19
21
|
parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
|
20
22
|
parser.add_argument('-i','--index' , type=str, required=True, help='The index file of chromap')
|
@@ -25,12 +27,13 @@ parser.add_argument('-t','--thread', type=int, default=4, help='Number of t
|
|
25
27
|
parser.add_argument('--blacklist' , type=str, default='same', help='Blacklist of genome regions in bed format. "none" for no filter')
|
26
28
|
parser.add_argument('--binsize' , type=str, default=100, help='Bin size for calculating bw residue')
|
27
29
|
parser.add_argument('--normalize' , type=str, default='True', help='Whether to normalize the BigWig file. "True" or "False"')
|
30
|
+
parser.add_argument('--ignore_chr' , action='store_true', help='If not set, only chr1-chr22, chrX, chrY, chrM will be analyzed.')
|
28
31
|
|
29
|
-
args = parser.parse_args()
|
30
32
|
|
33
|
+
args = parser.parse_args()
|
31
34
|
|
32
35
|
if (args.genome == 'hg38') or (args.genome == 'mm10'):
|
33
|
-
dir_chrom_sizes = os.path.join(
|
36
|
+
dir_chrom_sizes = os.path.join(utility_dir, f'{args.genome}.chrom.sizes')
|
34
37
|
else:
|
35
38
|
dir_chrom_sizes = args.genome
|
36
39
|
|
@@ -42,7 +45,7 @@ if args.blacklist == 'same':
|
|
42
45
|
args.blacklist = args.genome
|
43
46
|
|
44
47
|
if (args.blacklist == 'hg38') or (args.blacklist == 'mm10'):
|
45
|
-
blacklist = os.path.join(
|
48
|
+
blacklist = os.path.join(utility_dir, f'offtracker_blacklist_{args.blacklist}.merged.bed')
|
46
49
|
else:
|
47
50
|
blacklist = args.blacklist
|
48
51
|
|
@@ -52,59 +55,40 @@ else:
|
|
52
55
|
if not os.path.exists(args.outdir):
|
53
56
|
os.makedirs(args.outdir)
|
54
57
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
prefix = gz_R2.str.extract('(.*)(?:.fq|.fastq)',expand=False)
|
65
|
-
|
66
|
-
nametype = None
|
67
|
-
for a_type in ['_trimmed_2', '_2_val_2','_R2_val_2','_R2','_2']:
|
68
|
-
len_type = len(a_type)
|
69
|
-
if prefix[0][-len_type:] == a_type:
|
70
|
-
nametype = a_type
|
71
|
-
sample_dir = prefix.str[:-len_type]
|
72
|
-
break
|
73
|
-
|
74
|
-
if nametype is None:
|
75
|
-
# pattern 搜索模式,可能会出 bug
|
76
|
-
# find "_R2." or "_2." in prefix[0]
|
77
|
-
pattern = re.compile(r'(_R2\.|_2\.)')
|
78
|
-
m = pattern.search(prefix[0])
|
79
|
-
if m:
|
80
|
-
nametype = prefix[0][m.span()[0]:]
|
81
|
-
len_type = len(nametype)
|
82
|
-
sample_dir = prefix.str[:-len_type]
|
83
|
-
|
84
|
-
assert nametype is not None, 'No fastq detected or the file name is invaild!'
|
85
|
-
|
86
|
-
sample_name = sample_dir.apply(os.path.basename)
|
58
|
+
if args.ignore_chr:
|
59
|
+
args.ignore_chr = '--ignore_chr'
|
60
|
+
else:
|
61
|
+
args.ignore_chr = ''
|
62
|
+
|
63
|
+
# 搜索 folder 的 n级子目录下的所有 fastq/fastq.gz/fq/fq.gz 文件
|
64
|
+
sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder, NGS_type=args.NGS_type)
|
65
|
+
|
66
|
+
assert not isinstance(sample_names, str), 'No fastq file is detected!'
|
87
67
|
|
88
68
|
dict_yaml = {
|
89
|
-
|
90
|
-
'
|
69
|
+
# fastq 信息
|
70
|
+
'files_R1':dict(zip(sample_names,files_R1)),
|
71
|
+
'files_R2':dict(zip(sample_names,files_R2)), # 单端 files_R2=[] 结果会自动为 {}
|
72
|
+
'NGS_type':args.NGS_type,
|
73
|
+
# 输入输出文件夹
|
91
74
|
'input_dir':args.folder,
|
92
75
|
'output_dir':args.outdir,
|
76
|
+
# 运行参数
|
93
77
|
'thread':args.thread,
|
94
78
|
'index':args.index,
|
95
79
|
'fasta':args.ref,
|
96
80
|
'binsize':args.binsize,
|
97
81
|
'blacklist':blacklist,
|
98
|
-
'nametype':nametype,
|
99
82
|
'genomelen':dir_chrom_sizes,
|
100
83
|
'normalize':args.normalize,
|
101
|
-
'
|
84
|
+
'utility_dir':utility_dir,
|
85
|
+
'ignore_chr':args.ignore_chr,
|
102
86
|
}
|
103
87
|
|
104
88
|
with open( os.path.join(args.outdir,'config.yaml'), 'w') as outfile:
|
105
89
|
yaml.dump(dict_yaml, outfile, default_flow_style=False)
|
106
90
|
|
107
|
-
snakefile = os.path.join(script_dir, '
|
91
|
+
snakefile = os.path.join(script_dir, 'snakefile/Snakefile_offtracker.smk')
|
108
92
|
shutil.copy(snakefile, os.path.join(args.outdir,'Snakefile'))
|
109
93
|
|
110
94
|
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
THIS_VERSION = '0.4.1'

# Command-line script: detects the fastq files under --folder, writes a
# config.yaml describing them into the output folder and copies the QC
# Snakefile next to it.

import argparse
import os, glob, yaml
import sys
import pandas as pd
import shutil, re
import offtracker
import offtracker.X_sequence as xseq

script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
utility_dir = os.path.join(script_dir, 'utility')
# Make the bundled bedGraphToBigWig binary executable. On installs the current
# user does not own (e.g. a system-wide site-packages) chmod fails; that should
# not stop this script, so the failure is only reported.
try:
    os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
except OSError as err:
    print(f'Warning: could not set the executable bit on bedGraphToBigWig: {err}', file=sys.stderr)

###
parser = argparse.ArgumentParser()
parser.description=f'xbulk_qc v{THIS_VERSION}. QC and trim fastq files.'
parser.add_argument('-f','--folder', type=str, required=True,  help='Directory of the input folder' )
parser.add_argument('-o','--outdir', type=str, default='same', help='The output folder')
parser.add_argument('--subfolder'  , type=int, default=0,      help='subfolder level')
parser.add_argument('-t','--thread', type=int, default=8,      help='Number of threads to be used')
parser.add_argument('--NGS_type'   , type=str, default='paired-end', help='paired-end or single-end')

args = parser.parse_args()

# Automatic parameter adjustment: 'same' means a Trimmed_data folder inside
# the input folder. The output folder is created when missing.
if args.outdir == 'same':
    args.outdir = os.path.join(args.folder,'Trimmed_data')
os.makedirs(args.outdir, exist_ok=True)

# Search the n-level subfolders of --folder for fastq/fastq.gz/fq/fq.gz files
sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder, NGS_type=args.NGS_type)

# A string in place of the sample names is treated as "nothing found".
# An explicit exit is used because a bare assert disappears under `python -O`.
if isinstance(sample_names, str):
    raise SystemExit('No fastq file is detected!')

dict_yaml = {
    # fastq information
    'files_R1':dict(zip(sample_names,files_R1)),
    'files_R2':dict(zip(sample_names,files_R2)), # single-end: files_R2=[] yields {}
    'NGS_type':args.NGS_type,
    # input and output folders
    'input_dir':args.folder,
    'output_dir':args.outdir,
    # run parameters
    'thread':args.thread,
    'utility_dir':utility_dir
    }


with open( os.path.join(args.outdir,'config.yaml'), 'w', encoding='utf-8') as outfile:
    yaml.dump(dict_yaml, outfile, default_flow_style=False)

snakefile = os.path.join(script_dir, 'snakefile/Snakefile_QC.smk')
shutil.copy(snakefile, os.path.join(args.outdir,'Snakefile'))
|
61
|
+
|
62
|
+
|
@@ -11,7 +11,7 @@ from setuptools import find_packages, setup, Command
|
|
11
11
|
NAME = 'offtracker'
|
12
12
|
DESCRIPTION = 'Tracking-seq data analysis'
|
13
13
|
AUTHOR = 'Runda Xu'
|
14
|
-
EMAIL = '
|
14
|
+
EMAIL = 'xrd18@tsinghua.org.cn'
|
15
15
|
URL = 'https://github.com/Lan-lab/offtracker'
|
16
16
|
REQUIRES_PYTHON = '>=3.6.0'
|
17
17
|
|
@@ -47,9 +47,10 @@ setup(
|
|
47
47
|
author_email=EMAIL,
|
48
48
|
url=URL,
|
49
49
|
python_requires=REQUIRES_PYTHON,
|
50
|
-
packages=
|
51
|
-
package_data={'offtracker': ['
|
52
|
-
scripts = ['scripts/
|
50
|
+
packages=['offtracker'],
|
51
|
+
package_data={'offtracker': ['snakefile/*','utility/*']},
|
52
|
+
scripts = ['scripts/offtracker_qc.py',
|
53
|
+
'scripts/offtracker_config.py',
|
53
54
|
'scripts/offtracker_candidates.py',
|
54
55
|
'scripts/offtracker_analysis.py',
|
55
56
|
'scripts/offtracker_plot.py'],
|