offtracker 2.7.8__zip → 2.10.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. offtracker-2.10.0/PKG-INFO +233 -0
  2. offtracker-2.10.0/README.md +221 -0
  3. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offplot.py +37 -8
  4. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
  5. offtracker-2.10.0/offtracker/_version.py +36 -0
  6. offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.8/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
  9. offtracker-2.10.0/offtracker.egg-info/PKG-INFO +233 -0
  10. offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_analysis.py +20 -5
  12. offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.0/scripts/offtracker_plot.py +39 -0
  15. offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
  16. {offtracker-2.7.8 → offtracker-2.10.0}/setup.py +8 -4
  17. offtracker-2.7.8/PKG-INFO +0 -146
  18. offtracker-2.7.8/README.md +0 -134
  19. offtracker-2.7.8/offtracker/_version.py +0 -28
  20. offtracker-2.7.8/offtracker/mapping/Snakefile_offtracker +0 -245
  21. offtracker-2.7.8/offtracker.egg-info/PKG-INFO +0 -146
  22. offtracker-2.7.8/offtracker.egg-info/SOURCES.txt +0 -25
  23. offtracker-2.7.8/scripts/offtracker_candidates.py +0 -307
  24. {offtracker-2.7.8 → offtracker-2.10.0}/LICENSE.txt +0 -0
  25. {offtracker-2.7.8 → offtracker-2.10.0}/MANIFEST.in +0 -0
  26. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
  27. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
  28. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  29. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
  30. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
  31. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
  32. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  33. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  34. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
  35. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
  36. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
  37. {offtracker-2.7.8 → offtracker-2.10.0}/setup.cfg +0 -0
@@ -1,307 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # 2023.10.27. v2.0: 2.0以target_location midpoint为中心,因此取消 pct 计算
5
- # 2023.12.06. v2.1: 2.1增加 cleavage_site 推测, 修正 deletion 错位, 以 cleavage_site 为中心
6
- import os,sys,re,time
7
- from itertools import product
8
-
9
- if sys.version_info < (3,0):
10
- import platform
11
- raise Exception(f'python3 is needed, while running {platform.python_version()} now')
12
-
13
- import offtracker
14
- import offtracker.X_sequence as xseq
15
- script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
16
- script_folder= os.path.join(script_dir, 'mapping')
17
-
18
- import argparse
19
- import pandas as pd
20
- import pybedtools
21
- import multiprocessing as mp
22
- from Bio.Blast.Applications import NcbiblastnCommandline
23
-
24
- def main():
25
- parser = argparse.ArgumentParser()
26
- parser.description='Generate candidate regions by sgRNA sequence'
27
- parser.add_argument('--sgrna' , type=str, required=True, help='sgRNA sequence without PAM' )
28
- parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
29
- parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
30
- parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
31
- parser.add_argument('-b','--blastdb', type=str, required=True, help='blast database')
32
- parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
33
- parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
34
- parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
35
- parser.add_argument('--quick_mode' , action='store_true', help='BLAST faster but less candidates.')
36
-
37
- args = parser.parse_args()
38
-
39
-
40
- if (args.genome == 'hg38') or (args.genome == 'mm10'):
41
- dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
42
- else:
43
- dir_chrom_sizes = args.genome
44
-
45
- sgRNA_name = args.name
46
- sgRNA_seq = args.sgrna
47
- PAM = args.pam
48
- n_threads = args.thread
49
- dir_output = args.outdir
50
- if not os.path.exists(dir_output):
51
- os.makedirs(dir_output)
52
- dir_ref_fa = args.ref
53
- blast_db = args.blastdb
54
- quick_mode = args.quick_mode
55
-
56
- # parameters for alignment
57
- half_width = 100
58
- pct_params = 1.0
59
- frag_len= half_width*2
60
- dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
61
-
62
-
63
- sgRNA_seq = sgRNA_seq.upper()
64
- PAM = PAM.upper()
65
- dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
66
- dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
67
- dir_sgRNA_bed = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')
68
-
69
-
70
- possible_sgRNA_PAM = list(product([sgRNA_seq],xseq.possible_seq(PAM)))
71
- possible_sgRNA_PAM = [''.join(combination) for combination in possible_sgRNA_PAM]
72
- n_seq = len(possible_sgRNA_PAM)
73
-
74
- ID = pd.Series(['seq']*n_seq) + pd.Series(range(1,n_seq+1)).astype(str)
75
- df_sgRNA_PAM = pd.DataFrame({'ID':ID,'sequence':possible_sgRNA_PAM})
76
- xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)
77
-
78
-
79
-
80
- #########
81
- # BLAST #
82
- #########
83
- if os.path.isfile(dir_sgRNA_blast):
84
- print(f'{dir_sgRNA_blast} exists, skipped.')
85
- else:
86
- if quick_mode:
87
- print('Using quick mode for BLAST')
88
- blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
89
- db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
90
- gapopen=4, gapextend=2, reward=2, word_size=5, dust='no', soft_masking=False)
91
- else:
92
- blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
93
- db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
94
- gapopen=4, gapextend=2, reward=2, word_size=4, dust='no', soft_masking=False)
95
- print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
96
- blastx_cline()
97
- print(f'BLAST finished.')
98
-
99
- ##############
100
- # Output bed #
101
- ##############
102
-
103
- blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t',header=None)
104
- blast_regions.columns = ['query acc.','chr','% identity','alignment length','mismatches','gap opens','q. start','q. end','st','ed','evalue','bit score']
105
- blast_regions = blast_regions[blast_regions.evalue<10000]
106
-
107
- # reverse strand
108
- blast_regions['reverse'] = (blast_regions['st']>blast_regions['ed']).astype(int)
109
- blast_regions_f = blast_regions[blast_regions.reverse==0].copy()
110
- blast_regions_r = blast_regions[blast_regions.reverse==1].copy()
111
- temp = blast_regions_r['st'].copy()
112
- blast_regions_r['st'] = blast_regions_r['ed']
113
- blast_regions_r['ed'] = temp
114
- blast_regions = pd.concat([blast_regions_f, blast_regions_r])
115
- # sort and add location
116
- blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
117
- blast_regions['location']=blast_regions['chr'].str[:] + ':' + blast_regions['st'].astype(str).str[:] + '-' + blast_regions['ed'].astype(str).str[:]
118
- blast_regions = blast_regions.drop_duplicates(subset='location').copy()
119
-
120
- # alignment length 筛选
121
- len_sgRNA=len(sgRNA_seq)
122
- min_len = len_sgRNA-8
123
- blast_regions = blast_regions[blast_regions['alignment length']>=min_len].copy().reset_index(drop=True)
124
- blast_regions = blast_regions.reindex(columns = ['chr', 'st', 'ed' , 'query acc.', '% identity', 'alignment length', 'mismatches',
125
- 'gap opens', 'q. start', 'q. end', 'evalue', 'bit score', 'reverse', 'location'] )
126
-
127
- # 输出 bed 用于后续 alignment score 计算
128
- blast_regions_bed = blast_regions[['chr','st','ed']]
129
- xseq.write_bed(blast_regions_bed, dir_sgRNA_bed)
130
- # 对 bed 进行排序但不合并
131
- a = pybedtools.BedTool(dir_sgRNA_bed)
132
- a.sort(g=dir_chrom_sizes).saveas( dir_sgRNA_bed )
133
- print(f'Output {sgRNA_name}_PAM.bed')
134
-
135
-
136
- ###################
137
- # alignment score #
138
- ###################
139
- if os.path.isfile(dir_df_candidate):
140
- print(f'{dir_df_candidate} exists, skipped.')
141
- else:
142
- #########
143
- # 读取 blast bed
144
- #########
145
- common_chr = pd.Series(['chr']*23).str[:] + pd.Series(range(23)).astype(str).str[:]
146
- common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY'])]).to_numpy()
147
-
148
- bed_short = xseq.X_readbed(dir_sgRNA_bed)
149
- bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
150
- bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed'])/2).astype(int)
151
- bed_short['st'] = bed_short['midpoint'] - half_width
152
- bed_short['ed'] = bed_short['midpoint'] + half_width
153
- bed_short.loc[bed_short['st']<0,'st']=0
154
- bed_short = bed_short.drop_duplicates()
155
-
156
- #########
157
- # 根据 bed_f 位点 ed 前后 half_width 取基因组序列
158
- #########
159
-
160
- temp_bed = os.path.join(dir_output, 'temp.bed')
161
- xseq.write_bed(bed_short.iloc[:,:3], temp_bed)
162
- a = pybedtools.BedTool(temp_bed)
163
- fasta = pybedtools.example_filename(dir_ref_fa)
164
- a = a.sequence(fi=fasta)
165
- with open(a.seqfn) as f:
166
- fasta = {}
167
- for line in f:
168
- line = line.strip() # 去除末尾换行符
169
- if line[0] == '>':
170
- header = line[1:]
171
- else:
172
- sequence = line
173
- fasta[header] = fasta.get(header,'') + sequence
174
-
175
- # pybedtools 得到位置 chrA:X-Y 时,X数字会往左多1bp
176
-
177
- #########
178
- # local alignment
179
- #########
180
- DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):0.01,
181
- ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):0.01,
182
- ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):0.01,
183
- ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):0.01,
184
- ('N','N'): 2, ('N','C'):2, ('N','A'): 2, ('N','G'): 2, ('N','T'): 2}
185
- mismatch_score = 0.01
186
- # 添加 PAM
187
- sgRNA_PAM_fw = sgRNA_seq + PAM
188
- sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)
189
-
190
- list_args_fw=[]
191
- list_args_rv=[]
192
- for a_key in fasta.keys():
193
- seq = re.sub('[^ATCG]','N',fasta[a_key])
194
- list_args_fw.append( [a_key, sgRNA_PAM_fw, seq, frag_len, DNA_matrix, mismatch_score] )
195
- list_args_rv.append( [a_key, sgRNA_PAM_rv, seq, frag_len, DNA_matrix, mismatch_score] )
196
- st = time.time()
197
- with mp.Pool(n_threads) as p:
198
- list_align_forward = p.starmap(xseq.sgRNA_alignment, list_args_fw)
199
- ed = time.time()
200
- print('align_forward:{:.2f}'.format(ed-st))
201
- st = time.time()
202
- with mp.Pool(n_threads) as p:
203
- list_align_reverse = p.starmap(xseq.sgRNA_alignment, list_args_rv)
204
- ed = time.time()
205
- print('align_reverse:{:.2f}'.format(ed-st))
206
- #
207
- df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
208
- df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
209
- df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
210
- df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
211
- df_candidate['location'] = fasta.keys()
212
- df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
213
- #df_candidate['fw_score_2'] = df_candidate['fw_score']*(pct_params-df_candidate['fw_pct'].abs())
214
- #df_candidate['rv_score_2'] = df_candidate['rv_score']*(pct_params-df_candidate['rv_pct'].abs())
215
- #df_candidate['best_seq_score'] = df_candidate[['fw_score_2', 'rv_score_2']].max(axis=1)
216
- #df_candidate['best_strand'] = df_candidate[['fw_score_2', 'rv_score_2']].idxmax(axis='columns').replace({'fw_score_2':'+', 'rv_score_2':'-'})
217
- #df_candidate.loc[df_candidate['fw_score_2']==df_candidate['rv_score_2'],'best_strand']='equal_score'
218
- df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
219
- df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
220
- df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'
221
-
222
- # GG check
223
- # 2023.12.05 增加 cleavage_site 推测
224
- list_best_target = []
225
- list_best_location = []
226
- list_cleavage_site = []
227
- list_delete = []
228
- list_insert = []
229
- list_mismat = []
230
- list_GG = []
231
- for a_row in df_candidate.iterrows():
232
- if a_row[1]['best_strand']=='+':
233
- list_best_target.append(a_row[1]['fw_target'])
234
- list_best_location.append(a_row[1]['fw_location'])
235
- list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
236
- list_delete.append(a_row[1]['fw_deletion'])
237
- list_insert.append(a_row[1]['fw_insertion'])
238
- list_mismat.append(a_row[1]['fw_mismatch'])
239
- if a_row[1]['fw_target'][-2:]=='GG':
240
- list_GG.append('OK')
241
- else:
242
- list_GG.append('NO')
243
- elif a_row[1]['best_strand']=='-':
244
- list_best_target.append(a_row[1]['rv_target'])
245
- list_best_location.append(a_row[1]['rv_location'])
246
- list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
247
- list_delete.append(a_row[1]['rv_deletion'])
248
- list_insert.append(a_row[1]['rv_insertion'])
249
- list_mismat.append(a_row[1]['rv_mismatch'])
250
- if a_row[1]['rv_target'][-2:]=='GG':
251
- list_GG.append('OK')
252
- else:
253
- list_GG.append('NO')
254
- else:
255
- if a_row[1]['fw_target'][-2:]=='GG':
256
- list_best_target.append(a_row[1]['fw_target'])
257
- list_best_location.append(a_row[1]['fw_location'])
258
- list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
259
- list_delete.append(a_row[1]['fw_deletion'])
260
- list_insert.append(a_row[1]['fw_insertion'])
261
- list_mismat.append(a_row[1]['fw_mismatch'])
262
- list_GG.append('OK_same_score')
263
- # 发现没有 GG 则看 RC
264
- elif a_row[1]['rv_target'][-2:]=='GG':
265
- list_best_target.append(a_row[1]['rv_target'])
266
- list_best_location.append(a_row[1]['rv_location'])
267
- list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
268
- list_delete.append(a_row[1]['rv_deletion'])
269
- list_insert.append(a_row[1]['rv_insertion'])
270
- list_mismat.append(a_row[1]['rv_mismatch'])
271
- list_GG.append('OK_same_score')
272
- else:
273
- list_best_target.append(a_row[1]['fw_target'])
274
- list_best_location.append(a_row[1]['fw_location'])
275
- list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
276
- list_delete.append(a_row[1]['fw_deletion'])
277
- list_insert.append(a_row[1]['fw_insertion'])
278
- list_mismat.append(a_row[1]['fw_mismatch'])
279
- list_GG.append('NO_same_score')
280
- # 记入 df_candidate
281
- df_candidate['deletion'] = list_delete
282
- df_candidate['insertion'] = list_insert
283
- df_candidate['mismatch'] = list_mismat
284
- df_candidate['GG'] = list_GG
285
- df_candidate['best_target'] = list_best_target
286
- df_candidate['target_location'] = list_best_location
287
- df_candidate['cleavage_site'] = list_cleavage_site
288
-
289
- # 2.0 更新一下格式
290
- df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
291
- df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate],axis=1)
292
- # df_candidate['midpoint'] = ((df_candidate['ed'] + df_candidate['st'])/2).astype(int)
293
- df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')
294
-
295
- df_candidate.to_csv(dir_df_candidate)
296
- print(f'Output df_candidate_{sgRNA_name}.csv')
297
- os.remove(temp_bed)
298
-
299
- return 'Done!'
300
-
301
-
302
- if __name__ == '__main__' :
303
- result = main()
304
- print(result)
305
-
306
-
307
-
File without changes
File without changes
File without changes