offtracker 2.7.10__zip → 2.10.1__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {offtracker-2.7.10 → offtracker-2.10.1}/PKG-INFO +64 -20
  2. offtracker-2.7.10/offtracker.egg-info/PKG-INFO → offtracker-2.10.1/README.md +221 -189
  3. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_offplot.py +13 -2
  4. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_sequence.py +113 -7
  5. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/_version.py +9 -2
  6. offtracker-2.10.1/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.1/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.10/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.1/offtracker/utility/1.1_bed2fr.py +6 -4
  9. offtracker-2.7.10/README.md → offtracker-2.10.1/offtracker.egg-info/PKG-INFO +233 -177
  10. offtracker-2.10.1/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_analysis.py +10 -3
  12. offtracker-2.10.1/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.1/scripts/offtracker_qc.py +62 -0
  15. {offtracker-2.7.10 → offtracker-2.10.1}/setup.py +5 -4
  16. offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker +0 -245
  17. offtracker-2.7.10/offtracker.egg-info/SOURCES.txt +0 -26
  18. offtracker-2.7.10/scripts/offtracker_candidates.py +0 -307
  19. {offtracker-2.7.10 → offtracker-2.10.1}/LICENSE.txt +0 -0
  20. {offtracker-2.7.10 → offtracker-2.10.1}/MANIFEST.in +0 -0
  21. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/X_offtracker.py +0 -0
  22. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker/__init__.py +0 -0
  23. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  24. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/bedGraphToBigWig +0 -0
  25. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/hg38.chrom.sizes +0 -0
  26. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/mm10.chrom.sizes +0 -0
  27. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  28. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.1/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  29. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-2.7.10 → offtracker-2.10.1}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_plot.py +0 -0
  33. {offtracker-2.7.10 → offtracker-2.10.1}/setup.cfg +0 -0

offtracker-2.10.1/scripts/offtracker_candidates.py
@@ -0,0 +1,318 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # 2023.10.27. v2.0: centers on the target_location midpoint, so the pct calculation is dropped
+ # 2023.12.06. v2.1: adds cleavage_site inference, fixes deletion misalignment, centers on the cleavage_site
+ # 2025.04.25. fix a letter-case issue
+ # 2025.06.11. reorder the code that skips already-existing candidates
+
+ import os,sys,re,time
+ from itertools import product, permutations
+
+ if sys.version_info < (3,0):
+     import platform
+     raise Exception(f'python3 is needed, while running {platform.python_version()} now')
+
+ import offtracker
+ import offtracker.X_sequence as xseq
+ script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
+ script_folder= os.path.join(script_dir, 'utility')
+
+ import argparse
+ import pandas as pd
+ import pybedtools
+ import multiprocessing as mp
+ from Bio.Blast.Applications import NcbiblastnCommandline
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.description='Generate candidate regions by sgRNA sequence'
+     parser.add_argument('--sgrna' , type=str, required=True, help='One sgRNA sequence without PAM' )
+     parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
+     parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
+     parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
+     parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
+     parser.add_argument('-b','--blastdb', type=str, required=True, help='blast database')
+     parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
+     parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
+     parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
+     parser.add_argument('--quick_mode' , action='store_true', help='BLAST faster but less candidates.')
+
+     args = parser.parse_args()
+
+
+     if (args.genome == 'hg38') or (args.genome == 'mm10'):
+         dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
+     else:
+         dir_chrom_sizes = args.genome
+
+     sgRNA_name = args.name
+     sgRNA_seq = args.sgrna
+     PAM = args.pam
+     PAM_loc = args.pam_location.lower()
+     n_threads = args.thread
+     dir_output = args.outdir
+     if not os.path.exists(dir_output):
+         os.makedirs(dir_output)
+     dir_ref_fa = args.ref
+     blast_db = args.blastdb
+     quick_mode = args.quick_mode
+
+     # parameters for alignment
+     half_width = 100
+     pct_params = 1.0
+     frag_len= half_width*2
+     dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
+     if os.path.isfile(dir_df_candidate):
+         print(f'{dir_df_candidate} exists, skipped.')
+         return 'skipped'
+
+     sgRNA_seq = sgRNA_seq.upper()
+     PAM = PAM.upper()
+     dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
+     dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
+     dir_sgRNA_bed = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')
+
+     if PAM_loc == 'downstream':
+         possible_sgRNA_PAM = list(product([sgRNA_seq],xseq.possible_seq(PAM)))
+     elif PAM_loc == 'upstream':
+         possible_sgRNA_PAM = list(product(xseq.possible_seq(PAM),[sgRNA_seq]))
+     else:
+         raise Exception(f'PAM_location should be "upstream" or "downstream", while {PAM_loc} is given.')
+     possible_sgRNA_PAM = [''.join(combination) for combination in possible_sgRNA_PAM]
+     n_seq = len(possible_sgRNA_PAM)
+
+     ID = pd.Series(['seq']*n_seq) + pd.Series(range(1,n_seq+1)).astype(str)
+     df_sgRNA_PAM = pd.DataFrame({'ID':ID,'sequence':possible_sgRNA_PAM})
+     xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)
+
+     #########
+     # BLAST #
+     #########
+     if os.path.isfile(dir_sgRNA_blast):
+         print(f'{dir_sgRNA_blast} exists, skipped.')
+     else:
+         if quick_mode:
+             print('Using quick mode for BLAST')
+             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
+                                                  db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
+                                                  gapopen=4, gapextend=2, reward=2, word_size=5, dust='no', soft_masking=False)
+         else:
+             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
+                                                  db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
+                                                  gapopen=4, gapextend=2, reward=2, word_size=4, dust='no', soft_masking=False)
+         print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
+         blastx_cline()
+         print(f'BLAST finished.')
+
+     ##############
+     # Output bed #
+     ##############
+
+     blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t',header=None)
+     blast_regions.columns = ['query acc.','chr','% identity','alignment length','mismatches','gap opens','q. start','q. end','st','ed','evalue','bit score']
+     blast_regions = blast_regions[blast_regions.evalue<10000]
+
+     # reverse strand
+     blast_regions['reverse'] = (blast_regions['st']>blast_regions['ed']).astype(int)
+     blast_regions_f = blast_regions[blast_regions.reverse==0].copy()
+     blast_regions_r = blast_regions[blast_regions.reverse==1].copy()
+     temp = blast_regions_r['st'].copy()
+     blast_regions_r['st'] = blast_regions_r['ed']
+     blast_regions_r['ed'] = temp
+     blast_regions = pd.concat([blast_regions_f, blast_regions_r])
+     # sort and add location
+     blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
+     blast_regions['location']=blast_regions['chr'].str[:] + ':' + blast_regions['st'].astype(str).str[:] + '-' + blast_regions['ed'].astype(str).str[:]
+     blast_regions = blast_regions.drop_duplicates(subset='location').copy()
+
+     # filter by alignment length
+     len_sgRNA=len(sgRNA_seq)
+     min_len = len_sgRNA-8
+     blast_regions = blast_regions[blast_regions['alignment length']>=min_len].copy().reset_index(drop=True)
+     blast_regions = blast_regions.reindex(columns = ['chr', 'st', 'ed' , 'query acc.', '% identity', 'alignment length', 'mismatches',
+                                                      'gap opens', 'q. start', 'q. end', 'evalue', 'bit score', 'reverse', 'location'] )
+
+     # write a bed file for the subsequent alignment score calculation
+     blast_regions_bed = blast_regions[['chr','st','ed']]
+     xseq.write_bed(blast_regions_bed, dir_sgRNA_bed)
+     # sort the bed file without merging intervals
+     a = pybedtools.BedTool(dir_sgRNA_bed)
+     a.sort(g=dir_chrom_sizes).saveas( dir_sgRNA_bed )
+     print(f'Output {sgRNA_name}_PAM.bed')
+
+
+     ###################
+     # alignment score #
+     ###################
+
+     #########
+     # read the blast bed
+     #########
+     common_chr = pd.Series(['chr']*23).str[:] + pd.Series(range(23)).astype(str).str[:]
+     common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY'])]).to_numpy()
+
+     bed_short = xseq.X_readbed(dir_sgRNA_bed)
+     bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
+     bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed'])/2).astype(int)
+     bed_short['st'] = bed_short['midpoint'] - half_width
+     bed_short['ed'] = bed_short['midpoint'] + half_width
+     bed_short.loc[bed_short['st']<0,'st']=0
+     bed_short = bed_short.drop_duplicates()
+
+     #########
+     # take the genomic sequence half_width on either side of the ed of each bed_f site
+     #########
+
+     temp_bed = os.path.join(dir_output, 'temp.bed')
+     xseq.write_bed(bed_short.iloc[:,:3], temp_bed)
+     a = pybedtools.BedTool(temp_bed)
+     fasta = pybedtools.example_filename(dir_ref_fa)
+     a = a.sequence(fi=fasta)
+     with open(a.seqfn) as f:
+         fasta = {}
+         for line in f:
+             line = line.strip() # strip the trailing newline
+             if line[0] == '>':
+                 header = line[1:]
+             else:
+                 sequence = line
+                 fasta[header] = fasta.get(header,'') + sequence
+
+     # when pybedtools reports a location chrA:X-Y, X is shifted 1 bp to the left
+
+     #########
+     # local alignment
+     #########
+     # build the DNA_matrix
+     mismatch_score = 0.01
+     base_codes = list(xseq.ambiguous_nt.keys())
+     all_base_pairs = list(permutations(base_codes,2)) + [(x,x) for x in base_codes]
+     DNA_matrix = {x : xseq.get_base_score(*x, mismatch_score=mismatch_score) for x in all_base_pairs}
+     # append the PAM
+     if PAM_loc == 'downstream':
+         sgRNA_PAM_fw = sgRNA_seq + PAM
+     else:
+         sgRNA_PAM_fw = PAM + sgRNA_seq
+
+     sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)
+
+     list_args_fw=[]
+     list_args_rv=[]
+     for a_key, a_seq in fasta.items():
+         # 2025.04.25 fix a letter-case issue
+         a_seq = re.sub('[^ATCG]','N',a_seq.upper())
+         list_args_fw.append( [a_key, sgRNA_PAM_fw, a_seq, frag_len, DNA_matrix, mismatch_score] )
+         list_args_rv.append( [a_key, sgRNA_PAM_rv, a_seq, frag_len, DNA_matrix, mismatch_score] )
+     st = time.time()
+     with mp.Pool(n_threads) as p:
+         list_align_forward = p.starmap(xseq.sgRNA_alignment, list_args_fw)
+     ed = time.time()
+     print('align_forward:{:.2f}'.format(ed-st))
+     st = time.time()
+     with mp.Pool(n_threads) as p:
+         list_align_reverse = p.starmap(xseq.sgRNA_alignment, list_args_rv)
+     ed = time.time()
+     print('align_reverse:{:.2f}'.format(ed-st))
+     #
+     df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
+     df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
+     df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
+     df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
+     df_candidate['location'] = fasta.keys()
+     df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
+     #df_candidate['fw_score_2'] = df_candidate['fw_score']*(pct_params-df_candidate['fw_pct'].abs())
+     #df_candidate['rv_score_2'] = df_candidate['rv_score']*(pct_params-df_candidate['rv_pct'].abs())
+     #df_candidate['best_seq_score'] = df_candidate[['fw_score_2', 'rv_score_2']].max(axis=1)
+     #df_candidate['best_strand'] = df_candidate[['fw_score_2', 'rv_score_2']].idxmax(axis='columns').replace({'fw_score_2':'+', 'rv_score_2':'-'})
+     #df_candidate.loc[df_candidate['fw_score_2']==df_candidate['rv_score_2'],'best_strand']='equal_score'
+     df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
+     df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
+     df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'
+
+     # GG check
+     # 2023.12.05 add cleavage_site inference
+     list_best_target = []
+     list_best_location = []
+     list_cleavage_site = []
+     list_delete = []
+     list_insert = []
+     list_mismat = []
+     list_GG = []
+     for a_row in df_candidate.iterrows():
+         if a_row[1]['best_strand']=='+':
+             list_best_target.append(a_row[1]['fw_target'])
+             list_best_location.append(a_row[1]['fw_location'])
+             list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+             list_delete.append(a_row[1]['fw_deletion'])
+             list_insert.append(a_row[1]['fw_insertion'])
+             list_mismat.append(a_row[1]['fw_mismatch'])
+             if a_row[1]['fw_target'][-2:]=='GG':
+                 list_GG.append('OK')
+             else:
+                 list_GG.append('NO')
+         elif a_row[1]['best_strand']=='-':
+             list_best_target.append(a_row[1]['rv_target'])
+             list_best_location.append(a_row[1]['rv_location'])
+             list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
+             list_delete.append(a_row[1]['rv_deletion'])
+             list_insert.append(a_row[1]['rv_insertion'])
+             list_mismat.append(a_row[1]['rv_mismatch'])
+             if a_row[1]['rv_target'][-2:]=='GG':
+                 list_GG.append('OK')
+             else:
+                 list_GG.append('NO')
+         else:
+             if a_row[1]['fw_target'][-2:]=='GG':
+                 list_best_target.append(a_row[1]['fw_target'])
+                 list_best_location.append(a_row[1]['fw_location'])
+                 list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+                 list_delete.append(a_row[1]['fw_deletion'])
+                 list_insert.append(a_row[1]['fw_insertion'])
+                 list_mismat.append(a_row[1]['fw_mismatch'])
+                 list_GG.append('OK_same_score')
+             # if no GG is found, check the reverse complement
+             elif a_row[1]['rv_target'][-2:]=='GG':
+                 list_best_target.append(a_row[1]['rv_target'])
+                 list_best_location.append(a_row[1]['rv_location'])
+                 list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
+                 list_delete.append(a_row[1]['rv_deletion'])
+                 list_insert.append(a_row[1]['rv_insertion'])
+                 list_mismat.append(a_row[1]['rv_mismatch'])
+                 list_GG.append('OK_same_score')
+             else:
+                 list_best_target.append(a_row[1]['fw_target'])
+                 list_best_location.append(a_row[1]['fw_location'])
+                 list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+                 list_delete.append(a_row[1]['fw_deletion'])
+                 list_insert.append(a_row[1]['fw_insertion'])
+                 list_mismat.append(a_row[1]['fw_mismatch'])
+                 list_GG.append('NO_same_score')
+     # record in df_candidate
+     df_candidate['deletion'] = list_delete
+     df_candidate['insertion'] = list_insert
+     df_candidate['mismatch'] = list_mismat
+     df_candidate['GG'] = list_GG
+     df_candidate['best_target'] = list_best_target
+     df_candidate['target_location'] = list_best_location
+     df_candidate['cleavage_site'] = list_cleavage_site
+
+     # 2.0 update the format
+     df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
+     df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate],axis=1)
+     # df_candidate['midpoint'] = ((df_candidate['ed'] + df_candidate['st'])/2).astype(int)
+     df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')
+
+     df_candidate.to_csv(dir_df_candidate)
+     print(f'Output df_candidate_{sgRNA_name}.csv')
+     os.remove(temp_bed)
+
+     return 'Done!'
+
+
+ if __name__ == '__main__' :
+     result = main()
+     print(result)
+
+
+
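
The new scripts/offtracker_candidates.py above enumerates every concrete spacer+PAM string (expanding ambiguity codes in the PAM), writes them as seq1..seqN FASTA records, and BLASTs them with blastn-short. Below is a minimal, self-contained sketch of that expansion step only; expand_ambiguous and the example spacer are illustrative stand-ins, not offtracker's xseq.possible_seq.

    # Simplified illustration of the spacer+PAM expansion performed before BLAST.
    # Not the packaged implementation: expand_ambiguous stands in for xseq.possible_seq.
    from itertools import product

    IUPAC = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
             'R': 'AG', 'Y': 'CT', 'S': 'CG', 'W': 'AT', 'K': 'GT', 'M': 'AC',
             'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG', 'N': 'ACGT'}

    def expand_ambiguous(seq):
        """Return every concrete sequence matching an IUPAC-ambiguous string."""
        return [''.join(p) for p in product(*(IUPAC[b] for b in seq.upper()))]

    sgrna = 'GACCCCCTCCACCCCGCCTC'          # arbitrary example spacer, no PAM
    queries = [sgrna + pam for pam in expand_ambiguous('NGG')]  # Cas9: PAM downstream
    for i, q in enumerate(queries, start=1):
        print(f'>seq{i}')                   # FASTA records like those fed to blastn-short
        print(q)
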
{offtracker-2.7.10 → offtracker-2.10.1}/scripts/offtracker_config.py
@@ -1,20 +1,22 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # 2023.08.11. v1.1 adding a option for not normalizing the bw file
+ # 2023.08.11. adding a option for not normalizing the bw file
+ # 2025.05.22. refine the structure
+ # 2025.06.05. add the ignore_chr option; by default only common chromosomes are kept, used by 1.1_bed2fr.py

  import argparse
  import os, glob, yaml
  import pandas as pd
  import shutil, re
  import offtracker
+ import offtracker.X_sequence as xseq
  script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
- script_folder= os.path.join(script_dir, 'mapping')
- os.chmod( os.path.join(script_folder, 'bedGraphToBigWig'), 0o755)
+ utility_dir = os.path.join(script_dir, 'utility')

  ###
  parser = argparse.ArgumentParser()
- parser.description='Mapping fastq files of Track-seq.'
+ parser.description='Mapping fastq files of Tracking-seq.'
  parser.add_argument('-f','--folder', type=str, required=True, help='Directory of the input folder' )
  parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
  parser.add_argument('-i','--index' , type=str, required=True, help='The index file of chromap')
@@ -25,12 +27,13 @@ parser.add_argument('-t','--thread', type=int, default=4, help='Number of t
  parser.add_argument('--blacklist' , type=str, default='same', help='Blacklist of genome regions in bed format. "none" for no filter')
  parser.add_argument('--binsize' , type=str, default=100, help='Bin size for calculating bw residue')
  parser.add_argument('--normalize' , type=str, default='True', help='Whether to normalize the BigWig file. "True" or "False"')
+ parser.add_argument('--ignore_chr' , action='store_true', help='If not set, only chr1-chr22, chrX, chrY, chrM will be analyzed.')

- args = parser.parse_args()

+ args = parser.parse_args()

  if (args.genome == 'hg38') or (args.genome == 'mm10'):
-     dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
+     dir_chrom_sizes = os.path.join(utility_dir, f'{args.genome}.chrom.sizes')
  else:
      dir_chrom_sizes = args.genome

@@ -42,7 +45,7 @@ if args.blacklist == 'same':
      args.blacklist = args.genome

  if (args.blacklist == 'hg38') or (args.blacklist == 'mm10'):
-     blacklist = os.path.join(script_folder, f'offtracker_blacklist_{args.blacklist}.merged.bed')
+     blacklist = os.path.join(utility_dir, f'offtracker_blacklist_{args.blacklist}.merged.bed')
  else:
      blacklist = args.blacklist

@@ -52,59 +55,40 @@ else:
  if not os.path.exists(args.outdir):
      os.makedirs(args.outdir)

- gz_R2 = []
- for fastq in ['*2.*fq','*2.*fastq','*2.*fq.gz','*2.*fastq.gz']:
-     fq_files = glob.glob( os.path.join(args.folder, args.subfolder*'*/', fastq ) )
-     print('{} {} samples detected'.format( len(fq_files), fastq[4:] ) )
-     gz_R2.extend( fq_files )
-
- gz_R2.sort()
- gz_R2 = pd.Series(gz_R2)
- suffix = gz_R2.str.extract('(fastq.*|fq.*)',expand=False)
- prefix = gz_R2.str.extract('(.*)(?:.fq|.fastq)',expand=False)
-
- nametype = None
- for a_type in ['_trimmed_2', '_2_val_2','_R2_val_2','_R2','_2']:
-     len_type = len(a_type)
-     if prefix[0][-len_type:] == a_type:
-         nametype = a_type
-         sample_dir = prefix.str[:-len_type]
-         break
-
- if nametype is None:
-     # pattern search, may be buggy
-     # find "_R2." or "_2." in prefix[0]
-     pattern = re.compile(r'(_R2\.|_2\.)')
-     m = pattern.search(prefix[0])
-     if m:
-         nametype = prefix[0][m.span()[0]:]
-         len_type = len(nametype)
-         sample_dir = prefix.str[:-len_type]
-
- assert nametype is not None, 'No fastq detected or the file name is invaild!'
-
- sample_name = sample_dir.apply(os.path.basename)
+ if args.ignore_chr:
+     args.ignore_chr = '--ignore_chr'
+ else:
+     args.ignore_chr = ''
+
+ # search all fastq/fastq.gz/fq/fq.gz files in the n-th level subfolders of the input folder
+ sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder, NGS_type=args.NGS_type)
+
+ assert not isinstance(sample_names, str), 'No fastq file is detected!'

  dict_yaml = {
-     'suffix':suffix[0],
-     'sample':dict(zip(sample_name,sample_dir)),
+     # fastq info
+     'files_R1':dict(zip(sample_names,files_R1)),
+     'files_R2':dict(zip(sample_names,files_R2)), # for single-end data, files_R2=[] and the result is automatically {}
+     'NGS_type':args.NGS_type,
+     # input/output folders
      'input_dir':args.folder,
      'output_dir':args.outdir,
+     # run parameters
      'thread':args.thread,
      'index':args.index,
      'fasta':args.ref,
      'binsize':args.binsize,
      'blacklist':blacklist,
-     'nametype':nametype,
      'genomelen':dir_chrom_sizes,
      'normalize':args.normalize,
-     'script_folder':script_folder
+     'utility_dir':utility_dir,
+     'ignore_chr':args.ignore_chr,
  }

  with open( os.path.join(args.outdir,'config.yaml'), 'w') as outfile:
      yaml.dump(dict_yaml, outfile, default_flow_style=False)

- snakefile = os.path.join(script_dir, 'mapping/Snakefile_offtracker')
+ snakefile = os.path.join(script_dir, 'snakefile/Snakefile_offtracker.smk')
  shutil.copy(snakefile, os.path.join(args.outdir,'Snakefile'))

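
Both offtracker_config.py (above) and the new offtracker_qc.py (below) now delegate fastq discovery to xseq.detect_fastq, which returns sample names plus matched R1/R2 file lists and apparently returns a string when nothing is found (hence the isinstance assert). A rough, hypothetical stand-in with that contract might look like the sketch below; the real helper in offtracker/X_sequence.py may differ in every detail.

    # Hypothetical sketch of the contract offtracker_config.py relies on.
    # NOT the packaged xseq.detect_fastq; the file-name conventions are assumptions.
    import glob, os

    def detect_fastq(folder, n_subfolder=0, NGS_type='paired-end'):
        pattern = os.path.join(folder, *(['*'] * n_subfolder), '*')
        fastqs = sorted(f for f in glob.glob(pattern)
                        if f.endswith(('.fq', '.fastq', '.fq.gz', '.fastq.gz')))
        if not fastqs:
            return 'no fastq detected', [], []      # caller asserts on a str return
        if NGS_type == 'single-end':
            names = [os.path.basename(f).split('.')[0] for f in fastqs]
            return names, fastqs, []
        files_R1 = [f for f in fastqs if '_R1' in f or '_1.' in f]
        files_R2 = [f for f in fastqs if '_R2' in f or '_2.' in f]
        names = [os.path.basename(f).split('_')[0] for f in files_R1]
        return names, files_R1, files_R2
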
 
offtracker-2.10.1/scripts/offtracker_qc.py
@@ -0,0 +1,62 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ THIS_VERSION = '0.4.1'
+
+ import argparse
+ import os, glob, yaml
+ import pandas as pd
+ import shutil, re
+ import offtracker
+ import offtracker.X_sequence as xseq
+
+ script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
+ utility_dir = os.path.join(script_dir, 'utility')
+ os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
+
+ ###
+ parser = argparse.ArgumentParser()
+ parser.description=f'xbulk_qc v{THIS_VERSION}. QC and trim fastq files.'
+ parser.add_argument('-f','--folder', type=str, required=True, help='Directory of the input folder' )
+ parser.add_argument('-o','--outdir', type=str, default='same', help='The output folder')
+ parser.add_argument('--subfolder' , type=int, default=0, help='subfolder level')
+ parser.add_argument('-t','--thread', type=int, default=8, help='Number of threads to be used')
+ parser.add_argument('--NGS_type' , type=str, default='paired-end', help='paired-end or single-end')
+
+ args = parser.parse_args()
+
+ # automatic argument adjustment and error reporting
+ if args.outdir == 'same':
+     args.outdir = os.path.join(args.folder,'Trimmed_data')
+     if not os.path.exists( args.outdir ):
+         os.makedirs( args.outdir )
+ else:
+     if not os.path.exists(args.outdir):
+         os.makedirs(args.outdir)
+
+ # search all fastq/fastq.gz/fq/fq.gz files in the n-th level subfolders of the input folder
+ sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder, NGS_type=args.NGS_type)
+
+ assert not isinstance(sample_names, str), 'No fastq file is detected!'
+
+ dict_yaml = {
+     # fastq info
+     'files_R1':dict(zip(sample_names,files_R1)),
+     'files_R2':dict(zip(sample_names,files_R2)), # for single-end data, files_R2=[] and the result is automatically {}
+     'NGS_type':args.NGS_type,
+     # input/output folders
+     'input_dir':args.folder,
+     'output_dir':args.outdir,
+     # run parameters
+     'thread':args.thread,
+     'utility_dir':utility_dir
+ }
+
+
+ with open( os.path.join(args.outdir,'config.yaml'), 'w', encoding='utf-8') as outfile:
+     yaml.dump(dict_yaml, outfile, default_flow_style=False)
+
+ snakefile = os.path.join(script_dir, 'snakefile/Snakefile_QC.smk')
+ shutil.copy(snakefile, os.path.join(args.outdir,'Snakefile'))
+
+
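
For reference, the config.yaml that offtracker_qc.py writes is simply the yaml.dump of dict_yaml above. A minimal illustration follows; the keys mirror the script, while the sample name and paths are made up.

    # Illustration only: keys mirror dict_yaml in offtracker_qc.py, values are hypothetical.
    import yaml

    dict_yaml = {
        'files_R1': {'sampleA': '/data/sampleA_R1.fq.gz'},   # hypothetical sample/path
        'files_R2': {'sampleA': '/data/sampleA_R2.fq.gz'},
        'NGS_type': 'paired-end',
        'input_dir': '/data',
        'output_dir': '/data/Trimmed_data',
        'thread': 8,
        'utility_dir': '/path/to/offtracker/utility',
    }
    print(yaml.dump(dict_yaml, default_flow_style=False))
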
{offtracker-2.7.10 → offtracker-2.10.1}/setup.py
@@ -11,7 +11,7 @@ from setuptools import find_packages, setup, Command
  NAME = 'offtracker'
  DESCRIPTION = 'Tracking-seq data analysis'
  AUTHOR = 'Runda Xu'
- EMAIL = 'runda.xu@foxmail.com'
+ EMAIL = 'xrd18@tsinghua.org.cn'
  URL = 'https://github.com/Lan-lab/offtracker'
  REQUIRES_PYTHON = '>=3.6.0'

@@ -47,9 +47,10 @@ setup(
      author_email=EMAIL,
      url=URL,
      python_requires=REQUIRES_PYTHON,
-     packages=find_packages(),
-     package_data={'offtracker': ['mapping/*']},
-     scripts = ['scripts/offtracker_config.py',
+     packages=['offtracker'],
+     package_data={'offtracker': ['snakefile/*','utility/*']},
+     scripts = ['scripts/offtracker_qc.py',
+                'scripts/offtracker_config.py',
                 'scripts/offtracker_candidates.py',
                 'scripts/offtracker_analysis.py',
                 'scripts/offtracker_plot.py'],