offtracker 1.0.1__zip → 2.7.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {offtracker-1.0.1/offtracker.egg-info → offtracker-2.7.7}/PKG-INFO +13 -6
  2. {offtracker-1.0.1 → offtracker-2.7.7}/README.md +12 -5
  3. offtracker-2.7.7/offtracker/X_offplot.py +123 -0
  4. offtracker-2.7.7/offtracker/X_offtracker.py +338 -0
  5. offtracker-1.0.1/offtracker/X_general.py → offtracker-2.7.7/offtracker/X_sequence.py +18 -5
  6. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/__init__.py +1 -1
  7. offtracker-2.7.7/offtracker/_version.py +27 -0
  8. offtracker-2.7.7/offtracker/mapping/Snakefile_offtracker +245 -0
  9. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +3846 -0
  10. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +5827 -0
  11. {offtracker-1.0.1 → offtracker-2.7.7/offtracker.egg-info}/PKG-INFO +13 -6
  12. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/SOURCES.txt +4 -3
  13. offtracker-2.7.7/scripts/offtracker_analysis.py +369 -0
  14. {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_candidates.py +59 -101
  15. {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_config.py +15 -10
  16. offtracker-1.0.1/offtracker/X_analysis.py +0 -332
  17. offtracker-1.0.1/offtracker/_version.py +0 -1
  18. offtracker-1.0.1/offtracker/mapping/Snakefile_Trackseq +0 -193
  19. offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +0 -22228
  20. offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +0 -9347
  21. offtracker-1.0.1/scripts/offtracker_analysis.py +0 -407
  22. {offtracker-1.0.1 → offtracker-2.7.7}/LICENSE.txt +0 -0
  23. {offtracker-1.0.1 → offtracker-2.7.7}/MANIFEST.in +0 -0
  24. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.1_bed2fr_v4.5.py +0 -0
  25. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.3_bdg_normalize_v4.0.py +0 -0
  26. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/bedGraphToBigWig +0 -0
  27. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/hg38.chrom.sizes +0 -0
  28. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/mm10.chrom.sizes +0 -0
  29. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-1.0.1 → offtracker-2.7.7}/setup.cfg +0 -0
  33. {offtracker-1.0.1 → offtracker-2.7.7}/setup.py +0 -0
@@ -1,407 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- import os,glob,sys,re,time,shutil
5
-
6
- if sys.version_info < (3,0):
7
- import platform
8
- raise Exception(f'python3 is needed, while running {platform.python_version()} now')
9
-
10
- import offtracker
11
- from offtracker.X_analysis import *
12
- script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
13
- script_folder= os.path.join(script_dir, 'mapping')
14
-
15
- import argparse
16
- import pandas as pd
17
- import numpy as np
18
- import pybedtools
19
- from scipy.stats import norm
20
-
21
- def main():
22
- parser = argparse.ArgumentParser()
23
- parser.description='Analyze the ssChIP-seq data.'
24
- parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
25
- parser.add_argument('--seqfolder' , type=str, required=True, help='Directory of sgRNA information created by seq_cadidates.')
26
- parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
27
- parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
28
- parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
29
- parser.add_argument('--regions' , type=str, default='auto', nargs='+', help='Regions around candidate sites.' )
30
- parser.add_argument('--rgO' , type=str, default='mean', help='Regoins operation. mean/min/max for regions' )
31
- parser.add_argument('--repO' , type=str, default='mean', help='Replicate operation for multiple experimental samples (mean/min/max)' )
32
- parser.add_argument('--repOC' , type=str, default='mean', help='Replicate operation for multiple control samples (mean/max)' )
33
- parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
34
- parser.add_argument('-o','--outdir' , type=str, default='first',help='The output folder. Default is the first folder of --folder' )
35
- parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
36
- parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existed dataframes.' )
37
- parser.add_argument('--NoShapeBonus', action='store_true', help='Disable shape bonus for each site.' )
38
- parser.add_argument('--SignalFormula',type=int, default=4, help='The signal formula' )
39
- parser.add_argument('--ScoreFormula', type=int, default=3, help='The score formula' )
40
- parser.add_argument('--ShapeFormula', type=int, default=2, help='The shape formula' )
41
- parser.add_argument('--ShapeThresh', type=str, default='auto', help='The threshold for shape formula' )
42
- parser.add_argument('--ShapeWeight', type=str, default='auto', help='The weight for shape bonus' )
43
- parser.add_argument('--SeqScorePower',type=float, default=2, help='The seq score power' )
44
- parser.add_argument('--ScorePseudocount',type=float, default=0.9, help='The score pseudocount' )
45
- parser.add_argument('--RescueFactor',type=float, default=1.2, help='The rescue factor' )
46
- parser.add_argument('--RescueThresh1',type=float, default=1.5, help='The rescue threshold for min raw' )
47
- parser.add_argument('--RescueThresh2',type=float, default=1.8, help='The rescue threshold for raw score' )
48
- parser.add_argument('--clean', action='store_true', help='Whether to remove temp files')
49
-
50
- print(f'Runing offtracker verision: {offtracker.__version__}')
51
- # main parameters
52
- args = parser.parse_args()
53
- folders = args.folder
54
- seq_folder = args.seqfolder
55
- sgRNA_name = args.name
56
- control_mark = args.control
57
- exp_mark = args.exp
58
- ctr_samples = []
59
- outdir = args.outdir
60
- if outdir == 'first':
61
- outdir = folders[0]
62
- os.chdir(outdir)
63
- for a_folder in folders:
64
- temp_dir = os.path.join(a_folder, 'temp')
65
- if not os.path.exists( temp_dir ):
66
- os.makedirs(temp_dir)
67
-
68
- # parameters for algorithm
69
- operator = 'p'
70
-
71
- dict_signal_formula = {1:signal_formula1, 2:signal_formula2, 3:signal_formula3, 4:signal_formula4}
72
- dict_score_formula = {1:score_formula1, 2:score_formula2, 3:score_formula3, 4:score_formula4, 5:score_formula5}
73
- dict_shape_formula = {1:shape_formula1, 2:shape_formula2}
74
-
75
- the_signal_formula = dict_signal_formula[args.SignalFormula]
76
- if control_mark == 'none':
77
- print('No control samples, all samples in the folder are regarded as experimental samples.')
78
- the_score_formula = score_formula5
79
- else:
80
- the_score_formula = dict_score_formula[args.ScoreFormula]
81
- if args.NoShapeBonus:
82
- the_shape_formula = None
83
- print('the_shape_formula is None')
84
- else:
85
- the_shape_formula = dict_shape_formula[args.ShapeFormula]
86
-
87
- if args.ShapeFormula == 1:
88
- ratio_thresh = 1.5
89
- exp_weight = 0.3
90
- elif args.ShapeFormula == 2:
91
- ratio_thresh = 1
92
- exp_weight = 0.5
93
-
94
- if args.ShapeThresh != 'auto':
95
- ratio_thresh = float(args.ShapeThresh)
96
- if args.ShapeWeight != 'auto':
97
- exp_weight = float(args.ShapeWeight)
98
-
99
- seq_score_power = args.SeqScorePower
100
- if args.regions == 'auto':
101
- regions = [500, 1000, 2000, 3000]
102
- else:
103
- regions = list(map(int, args.regions))
104
- print('regions:',regions)
105
- bkgs = ['bkg1','bkg2','bkg3']
106
- noise_length = 5000
107
- max_region = max(regions)
108
- region_op = args.rgO
109
- repO = args.repO
110
- repOC = args.repOC
111
- score_pseudo = args.ScorePseudocount
112
- rescue_factor = args.RescueFactor
113
- assert repO in ['mean','min','max'], "--repO only accepts \"mean\",\"min\",or \"max\" "
114
- assert repOC in ['mean','max'], "--repO only accepts \"mean\" or \"max\" "
115
- print('repO',repO)
116
- print('repOC',repOC)
117
- print('ShapeThresh:',ratio_thresh)
118
- print('ShapeWeight:',exp_weight)
119
- print('SignalFormula:',args.SignalFormula)
120
- print('ScoreFormula:',args.ScoreFormula)
121
- print('ShapeFormula:',args.ShapeFormula)
122
- print('SeqScorePower:',args.SeqScorePower)
123
- print('ScorePseudocount:',args.ScorePseudocount)
124
- print('RescueFactor:',args.RescueFactor)
125
- # glob samples, check paired
126
- all_samples_name = []
127
- all_samples_pref = []
128
- for a_folder in folders:
129
- forward_bed_files = glob.glob( os.path.join( a_folder, '*.fw.bed' ) )
130
- forward_bed_files.sort()
131
- reverse_bed_files = glob.glob( os.path.join( a_folder, '*.rv.bed' ) )
132
- reverse_bed_files.sort()
133
- forward_samples = pd.Series(forward_bed_files).apply(os.path.basename).str[:-7]
134
- reverse_samples = pd.Series(reverse_bed_files).apply(os.path.basename).str[:-7]
135
- assert (forward_samples == reverse_samples).all()
136
- all_samples_name.extend( forward_samples )
137
- all_samples_pref.extend( pd.Series(forward_bed_files).str[:-7] )
138
- all_samples_pref = pd.Series(all_samples_pref)
139
- all_samples_name = pd.Series(all_samples_name)
140
-
141
- if control_mark == 'none':
142
- if exp_mark == 'all':
143
- exp_samples = list( all_samples_name )
144
- else:
145
- exp_samples = []
146
- for a_mark in exp_mark:
147
- exp_samples.extend( list( all_samples_name[all_samples_name.str.contains(a_mark)] ) )
148
- elif control_mark == 'others':
149
- if exp_mark == 'all':
150
- exp_samples = list( all_samples_name )
151
- else:
152
- exp_samples = []
153
- for a_mark in exp_mark:
154
- exp_samples.extend( list( all_samples_name[all_samples_name.str.contains(a_mark)] ) )
155
- ctr_samples = list( all_samples_name[~all_samples_name.isin(exp_samples)] )
156
- else:
157
- for a_mark in control_mark:
158
- ctr_samples.extend( list( all_samples_name[all_samples_name.str.contains(a_mark)] ) )
159
- if exp_mark == 'all':
160
- exp_samples = list( all_samples_name[~all_samples_name.isin(ctr_samples)] )
161
- else:
162
- exp_samples = []
163
- for a_mark in exp_mark:
164
- exp_samples.extend( list( all_samples_name[all_samples_name.str.contains(a_mark)] ) )
165
- n_exp = len(exp_samples)
166
- n_ctr = len(ctr_samples)
167
- print(f'Experimental group has {n_exp} samples:\n{exp_samples}')
168
- print(f'Control group has {n_ctr} samples:\n{ctr_samples}')
169
-
170
- # mark 错误时
171
- assert n_exp > 0, 'No experimental sample is found. Please check the name pattern.'
172
- if (n_ctr==0)&(control_mark != 'none'):
173
- print('Name pattern for control sample(s) was given, but no file meet the pattern.')
174
- return 'Program terminated'
175
-
176
- # sequence score
177
- try:
178
- df_alignment = pd.read_csv(os.path.join(seq_folder, f'df_alignment_{sgRNA_name}_{max_region}.csv'), index_col=0)
179
- except FileNotFoundError:
180
- return 'Please run offtracker_candidates.py first and provide the correct directory with --seqfolder'
181
-
182
- # chromosome sizes
183
- if (args.genome == 'hg38') or (args.genome == 'mm10'):
184
- dir_chrom_sizes = os.path.join(script_folder,f'{args.genome}.chrom.sizes')
185
- else:
186
- dir_chrom_sizes = args.genome
187
-
188
- # sgRNA length
189
- sgRNA_fa = os.path.join(seq_folder, sgRNA_name + '_PAM.fasta')
190
- if not os.path.isfile(sgRNA_fa):
191
- raise Exception(f'{sgRNA_name}_PAM.fasta is not in the seqfolder')
192
- with open(sgRNA_fa,'r') as f:
193
- temp = f.readlines()
194
- len_sgRNA_PAM = len(temp[1].strip())
195
- #
196
- selected_samples_pref = all_samples_pref[all_samples_name.isin(exp_samples+ctr_samples)]
197
- selected_samples_name = all_samples_name[all_samples_name.isin(exp_samples+ctr_samples)]
198
- # read counts on candidate regions
199
- for a_pref in selected_samples_pref:
200
- cand_count(a_pref, sgRNA_name, regions, seq_folder, dir_chrom_sizes, overwrite=False)
201
- bkg_count(a_pref, sgRNA_name, bkgs, seq_folder, dir_chrom_sizes, overwrite=False)
202
-
203
-
204
- ############
205
- ## signal ##
206
- ############
207
- if args.outname == 'same':
208
- outname = exp_mark
209
- else:
210
- outname = args.outname
211
- output = f'./temp/df_pivot_{outname}.csv'
212
- if (os.path.isfile(output))&(not args.overwrite):
213
- print(f'skip {output}')
214
- else:
215
- # 计算每个样本的 df_counts
216
- dict_df_counts = {}
217
- for a_pref, a_sample in zip(selected_samples_pref, selected_samples_name):
218
- dirname = os.path.dirname(a_pref)
219
- basename = os.path.basename(a_pref)
220
- temp_dir = os.path.join(dirname, 'temp')
221
- a_pref = os.path.join(temp_dir, basename)
222
- dir_df_counts = f'{a_pref}_{sgRNA_name}_df_counts_{the_signal_formula.__name__}_{operator}.csv'
223
- if (os.path.isfile(dir_df_counts))&(not args.overwrite):
224
- print(f'df_counts of {a_sample}_{sgRNA_name}_{the_signal_formula.__name__}_{operator} exists, loading.\n')
225
- df_counts_temp = pd.read_csv(dir_df_counts, index_col=0)
226
- else:
227
- print(f'For {a_sample}')
228
- df_counts_temp, df_noise_temp = load_count(a_sample, regions, bkgs, sgRNA_name = sgRNA_name, signal_formula = the_signal_formula, noise_length=noise_length,
229
- ratio_thresh=ratio_thresh, exp_weight=exp_weight, shape_formula=the_shape_formula, operator = operator,
230
- region_op = region_op, dirname = temp_dir, pseudo_count=1)
231
- print('\n')
232
- df_counts_temp['noise_1kb'] = df_noise_temp['noise_bp']*1000
233
- df_counts_temp.to_csv(dir_df_counts)
234
- dict_df_counts[a_sample] = df_counts_temp
235
- # 简化 df_counts 再合并
236
- dict_df_counts_sub = {}
237
- for a_sample in selected_samples_name:
238
- df_counts_sub = dict_df_counts[a_sample].copy()
239
- df_counts_sub = df_counts_sub[['chr','st','ed','midpoint','location','ID_1','ID_2','left_signal','right_signal','signal_FC', 'signal_min','shape_bonus','score']].copy()
240
- df_counts_sub['sample'] = a_sample
241
- dict_df_counts_sub[a_sample] = df_counts_sub
242
-
243
- # 合成 pivot 矩阵
244
- df_pivot = pd.concat(dict_df_counts_sub).pivot(index='location', columns='sample', values='score').fillna(0)
245
- df_shape = pd.concat(dict_df_counts_sub).pivot(index='location', columns='sample', values='shape_bonus').fillna(0)
246
- df_signal_FC = pd.concat(dict_df_counts_sub).pivot(index='location', columns='sample', values='signal_FC').fillna(0)
247
- df_signal_min = pd.concat(dict_df_counts_sub).pivot(index='location', columns='sample', values='signal_min').fillna(0)
248
-
249
- df_pivot['std_all'] = df_pivot.std(axis=1)
250
- if control_mark != 'none':
251
- df_pivot['control_mean'] = df_pivot[ctr_samples].mean(axis=1)
252
- df_pivot['control_max'] = df_pivot[ctr_samples].max(axis=1)
253
- df_pivot['control_std'] = df_pivot[ctr_samples].std(axis=1)
254
- df_pivot['control_signal_FC'] = df_signal_FC[ctr_samples].mean(axis=1)
255
- df_pivot['control_signal_min'] = df_signal_min[ctr_samples].mean(axis=1)
256
- if repOC == 'mean':
257
- df_pivot['control_raw'] = df_pivot['control_mean']
258
- df_pivot['control_shape'] = df_shape[ctr_samples].mean(axis=1)
259
- elif repOC == 'max':
260
- df_pivot['control_raw'] = df_pivot['control_max']
261
- df_pivot['control_shape'] = df_shape[ctr_samples].max(axis=1)
262
- df_pivot['control_signal'] = df_pivot['control_raw']*df_pivot['control_shape']
263
- else:
264
- df_pivot['control_signal'] = 'no_control'
265
-
266
- # group mean/min/max
267
- if repO == 'mean':
268
- df_pivot['exp_raw'] = df_pivot[exp_samples].mean(axis=1)
269
- df_pivot['exp_shape'] = df_shape[exp_samples].mean(axis=1)
270
- elif repO == 'min':
271
- df_pivot['exp_raw'] = df_pivot[exp_samples].min(axis=1)
272
- df_pivot['exp_shape'] = df_shape[exp_samples].min(axis=1)
273
- elif repO == 'max':
274
- df_pivot['exp_raw'] = df_pivot[exp_samples].max(axis=1)
275
- df_pivot['exp_shape'] = df_shape[exp_samples].max(axis=1)
276
- df_pivot['exp_signal_FC'] = df_signal_FC[exp_samples].mean(axis=1)
277
- df_pivot['exp_signal_min'] = df_signal_min[exp_samples].mean(axis=1)
278
- df_pivot['exp_signal'] = df_pivot['exp_raw']*df_pivot['exp_shape']
279
-
280
- # 添加坐标 和 ID
281
- bed_pivot = bedfmt(df_pivot.index)
282
- bed_pivot.index = df_pivot.index
283
- df_pivot = pd.concat([bed_pivot,df_pivot], axis=1)
284
- df_pivot['midpoint'] = df_pivot['st']+max_region
285
-
286
- point_head = (df_pivot['midpoint']/1000).astype(int)
287
- df_pivot['ID_1'] = df_pivot['chr'] + ':' + point_head.astype(str)
288
- point_tail = df_pivot['midpoint'] % 1000
289
- df_pivot.loc[point_tail<500,'ID_2'] = df_pivot['chr'] + ':' + (point_head-1).astype(str)
290
- df_pivot.loc[point_tail>=500,'ID_2'] = df_pivot['chr'] + ':' + (point_head+1).astype(str)
291
-
292
- df_alignment_sub = df_alignment.loc[df_pivot.index, ['best_strand','best_target','target_location','deletion','insertion','mismatch','GG','alignment_score','best_seq_score']]
293
- df_pivot = pd.concat([df_pivot,df_alignment_sub],axis=1)
294
- df_pivot.to_csv(output)
295
- ###########
296
- ## score ##
297
- ###########
298
-
299
- output = f'./temp/df_pivot_score_{outname}.csv'
300
- if (os.path.isfile(output))&(not args.overwrite):
301
- print(f'skip {output}')
302
- df_pivot = pd.read_csv(output, index_col=0)
303
- else:
304
- df_pivot = pd.read_csv(f'./temp/df_pivot_{outname}.csv', index_col=0)
305
- df_pivot[f'raw_score'] = df_pivot[['exp_signal','control_signal']].apply(lambda x: the_score_formula(x['exp_signal'], x['control_signal'], score_pseudo), axis=1 )
306
- mean_seq_score = round(df_pivot['best_seq_score'].mean(),3)
307
- df_pivot['norm_best_seq_score'] = np.power(df_pivot['best_seq_score']/mean_seq_score, seq_score_power)
308
- df_pivot[f'final_score'] = df_pivot[f'raw_score']*df_pivot['norm_best_seq_score']
309
- # record the final score before rescue
310
- df_pivot['final_score_before_rescue'] = df_pivot['final_score'].copy()
311
-
312
- # raw rescue
313
- df_raw_rescue = df_pivot[df_pivot['raw_score']>1.5].sort_values(by='raw_score', ascending=False)
314
- df_raw_rescue['min_raw'] = df_raw_rescue[exp_samples].min(axis=1)/df_raw_rescue['control_max'].clip(lower=0.5)
315
- # 对 min_raw rescue
316
- index_raw_rescue = df_raw_rescue[df_raw_rescue['min_raw']>args.RescueThresh1].index
317
- # 直接对 raw 进行 rescue
318
- index_raw_rescue_2 = df_raw_rescue[df_raw_rescue['raw_score']>args.RescueThresh2].index
319
- index_raw_rescue = np.union1d(index_raw_rescue,index_raw_rescue_2)
320
- print('rescue',len(index_raw_rescue),'sites')
321
- df_pivot['rescue_factor'] = 1
322
- df_pivot.loc[index_raw_rescue,'rescue_factor'] = rescue_factor
323
- df_pivot.loc[index_raw_rescue,f'final_score'] = df_pivot.loc[index_raw_rescue,f'final_score']*rescue_factor
324
-
325
- # dedup
326
- df_pivot = df_pivot.sort_values(by=f'final_score', ascending=False)
327
- list_nondup = dedup_two(df_pivot,'ID_1','ID_2')
328
- df_pivot = df_pivot[list_nondup]
329
-
330
- target_std=0.15
331
- df_pivot = df_pivot[df_pivot[f'final_score']>0].copy()
332
- n_bkg_sites = int(len(df_pivot)*0.99)
333
- score_bkg = df_pivot['final_score'][-n_bkg_sites:]
334
- mean_score_bkg = score_bkg.mean()
335
- std_score_bkg = score_bkg.std()
336
- df_pivot['norm_final_score'] = (df_pivot[f'final_score'] - mean_score_bkg) / std_score_bkg
337
- df_pivot['norm_final_score'] = df_pivot[f'norm_final_score']*target_std + 1
338
- df_pivot['norm_final_score'] = df_pivot['norm_final_score'].clip(lower=0)
339
- df_pivot['log2_norm_final_score'] = np.log2(df_pivot[f'norm_final_score']+1)
340
-
341
- # fitting normal distribution
342
- score_for_fitting = df_pivot['log2_norm_final_score'][-n_bkg_sites:]
343
- mu, std = norm.fit(score_for_fitting)
344
- print('mean_score:{:.3f};std:{:.3f}'.format(mu,std))
345
- # pv and fdr
346
- df_pivot['pv'] = df_pivot[f'log2_norm_final_score'].apply( lambda x: norm.sf(x,loc=mu,scale=std) )
347
- df_pivot['pv'].clip(lower=1e-320,inplace=True)
348
- df_pivot.to_csv(output)
349
-
350
- ############
351
- ## filter ##
352
- ############
353
-
354
- signal_min_thresh = -0.1
355
- signal_min_thresh_2 = -2
356
- search_distance = 40000
357
- seq_score_thresh = len_sgRNA_PAM*2 - 5*2
358
-
359
- df_result = df_pivot.copy()
360
- ##
361
- candidate_dup = df_result[ df_result[f'exp_signal_min']<=signal_min_thresh ].index[:500]
362
- list_dedup = []
363
- list_seq_score = []
364
- for a_loc in candidate_dup:
365
- temp_chr = df_result.loc[a_loc,'chr']
366
- temp_seq_score = df_result.loc[a_loc,'best_seq_score']
367
- # exp_signal_min 太低直接跳过得分判断
368
- if df_result.loc[a_loc,'exp_signal_min'] <= signal_min_thresh_2 :
369
- pass
370
- else:
371
- # 高于 X 分则跳过
372
- if temp_seq_score > seq_score_thresh:
373
- continue
374
- # 取排其前且同chr者
375
- temp_df_result = df_result.loc[:a_loc]
376
- temp_df_result = temp_df_result[temp_df_result['chr'] == temp_chr].iloc[:-1]
377
- if len(temp_df_result)==0:
378
- continue
379
- else:
380
- # 距离小于 search_distance 者记为信号溢出假阳性
381
- if (temp_df_result['midpoint']-df_result.loc[a_loc,'midpoint']).abs().min()<search_distance :
382
- list_dedup.append(a_loc)
383
- list_seq_score.append(temp_seq_score)
384
-
385
- df_result = df_result[~df_result.index.isin(list_dedup)].copy()
386
- print(f'filter {len(list_dedup)} sites')
387
-
388
- df_result['rank'] = range(1,len(df_result)+1)
389
- df_result['fdr'] = fdr(df_result['pv'])
390
- df_result.to_csv(f'./temp/df_result_{outname}.csv')
391
-
392
- df_output = df_result[df_result['log2_norm_final_score']>=1.3].copy()
393
- df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch','norm_best_seq_score','norm_final_score', 'log2_norm_final_score','fdr','rank']]
394
- df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch', 'seq_score', 'track_score', 'log2(track_score+1)','FDR', 'rank']
395
- df_output.to_csv(f'Trackseq_result_{outname}.csv', index=False)
396
-
397
- if args.clean:
398
- shutil.rmtree('./temp')
399
-
400
- return 'Done!'
401
-
402
-
403
- if __name__ == '__main__' :
404
- result = main()
405
- print(result)
406
-
407
-
File without changes
File without changes
File without changes
File without changes