offtracker 1.0.1__zip → 2.7.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {offtracker-1.0.1/offtracker.egg-info → offtracker-2.7.7}/PKG-INFO +13 -6
  2. {offtracker-1.0.1 → offtracker-2.7.7}/README.md +12 -5
  3. offtracker-2.7.7/offtracker/X_offplot.py +123 -0
  4. offtracker-2.7.7/offtracker/X_offtracker.py +338 -0
  5. offtracker-1.0.1/offtracker/X_general.py → offtracker-2.7.7/offtracker/X_sequence.py +18 -5
  6. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/__init__.py +1 -1
  7. offtracker-2.7.7/offtracker/_version.py +27 -0
  8. offtracker-2.7.7/offtracker/mapping/Snakefile_offtracker +245 -0
  9. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +3846 -0
  10. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +5827 -0
  11. {offtracker-1.0.1 → offtracker-2.7.7/offtracker.egg-info}/PKG-INFO +13 -6
  12. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/SOURCES.txt +4 -3
  13. offtracker-2.7.7/scripts/offtracker_analysis.py +369 -0
  14. {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_candidates.py +59 -101
  15. {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_config.py +15 -10
  16. offtracker-1.0.1/offtracker/X_analysis.py +0 -332
  17. offtracker-1.0.1/offtracker/_version.py +0 -1
  18. offtracker-1.0.1/offtracker/mapping/Snakefile_Trackseq +0 -193
  19. offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +0 -22228
  20. offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +0 -9347
  21. offtracker-1.0.1/scripts/offtracker_analysis.py +0 -407
  22. {offtracker-1.0.1 → offtracker-2.7.7}/LICENSE.txt +0 -0
  23. {offtracker-1.0.1 → offtracker-2.7.7}/MANIFEST.in +0 -0
  24. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.1_bed2fr_v4.5.py +0 -0
  25. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.3_bdg_normalize_v4.0.py +0 -0
  26. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/bedGraphToBigWig +0 -0
  27. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/hg38.chrom.sizes +0 -0
  28. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/mm10.chrom.sizes +0 -0
  29. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-1.0.1 → offtracker-2.7.7}/setup.cfg +0 -0
  33. {offtracker-1.0.1 → offtracker-2.7.7}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: offtracker
3
- Version: 1.0.1
3
+ Version: 2.7.7
4
4
  Summary: Track-seq data analysis
5
5
  Home-page: https://github.com/Lan-lab/offtracker
6
6
  Author: Runda Xu
@@ -79,8 +79,11 @@ offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
79
79
  -r /Your_Path_To_Reference/hg38_genome.fa \
80
80
  -i /Your_Path_To_Reference/hg38_genome.chromap.index \
81
81
  -f /Your_Path_To_Fastq \
82
- -o /Your_Path_To_Output \ # Default is outputting to /Your_Path_To_Fastq
83
- --subfolder 0 # If different samples are in seperate folders, set this to 1
82
+ -o /Your_Path_To_Output \
83
+ --subfolder 0
84
+
85
+ # --subfolder: If different samples are in separate folders, set this to 1
86
+ # -o: Default is outputting to /Your_Path_To_Fastq
84
87
 
85
88
  # Run the snakemake program
86
89
  cd /Your_Path_To_Fastq
@@ -103,13 +106,17 @@ Analyzing the off-target sites
103
106
  ```bash
104
107
  # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
105
108
 
106
- offtracker_analysis.py -g hg38 --name "HEK4" \ # the same as that in offtracker_candidates.py
107
- --exp 'Cas9_HEK4.*293' \ # add one or multiple patterns of file name in regex
108
- --control 'control' \ # add one or multiple patterns of file name in regex
109
+ offtracker_analysis.py -g hg38 --name "HEK4" \
110
+ --exp 'Cas9_HEK4.*293' \
111
+ --control 'control' \
109
112
  --outname 'Cas9_HEK4_293' \
110
113
  -f /Your_Path_To_Output \
111
114
  --seqfolder /Your_Path_To_Candidates
112
115
 
116
+ # --name: the same as that in offtracker_candidates.py
117
+ # --exp/--control: add one or multiple patterns of file name in regex
118
+
119
+
113
120
  # This step will generate Trackseq_result_{outname}.csv
114
121
  # Intermediate files are saved in ./temp folder, which can be deleted
115
122
  # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
@@ -2,8 +2,9 @@ LICENSE.txt
2
2
  MANIFEST.in
3
3
  README.md
4
4
  setup.py
5
- offtracker/X_analysis.py
6
- offtracker/X_general.py
5
+ offtracker/X_offplot.py
6
+ offtracker/X_offtracker.py
7
+ offtracker/X_sequence.py
7
8
  offtracker/__init__.py
8
9
  offtracker/_version.py
9
10
  offtracker.egg-info/PKG-INFO
@@ -13,7 +14,7 @@ offtracker.egg-info/requires.txt
13
14
  offtracker.egg-info/top_level.txt
14
15
  offtracker/mapping/1.1_bed2fr_v4.5.py
15
16
  offtracker/mapping/1.3_bdg_normalize_v4.0.py
16
- offtracker/mapping/Snakefile_Trackseq
17
+ offtracker/mapping/Snakefile_offtracker
17
18
  offtracker/mapping/bedGraphToBigWig
18
19
  offtracker/mapping/hg38.chrom.sizes
19
20
  offtracker/mapping/mm10.chrom.sizes
@@ -0,0 +1,369 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os,glob,sys,time,shutil
5
+
6
+ if sys.version_info < (3,0):
7
+ import platform
8
+ raise Exception(f'python3 is needed, while running {platform.python_version()} now')
9
+
10
+ import offtracker
11
+ import offtracker.X_sequence as xseq
12
+ script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
13
+ script_folder= os.path.join(script_dir, 'mapping')
14
+
15
+ import argparse
16
+ import pandas as pd
17
+ import numpy as np
18
+ import multiprocessing as mp
19
+ from scipy.stats import norm
20
+
21
+ def main():
22
+ parser = argparse.ArgumentParser()
23
+ parser.description='Analyze the Tracking-seq data.'
24
+ parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
25
+ parser.add_argument('--seqfolder'    , type=str, required=True, help='folder containing df_candidate created by offtracker_candidates.py.')
26
+ parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
27
+ parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
28
+ parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
29
+ parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
30
+ parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
31
+ parser.add_argument('--binsize'      , type=int, default=100, help='Bin size for calculating the signal.')
32
+ parser.add_argument('--flank_max'    , type=int, default=100000, help='Maximum flanking distance from the candidate site.')
33
+ parser.add_argument('--flank_regions', type=int, default=[1000,2000,3000,5000], nargs='+',help='flanking regions for calculating signal.')
34
+ parser.add_argument('--SeqScorePower', type=float, default=4, help='The seq score power' )
35
+ parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
36
+ parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
37
+ parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
38
+ parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
39
+ parser.add_argument('--outname'      , type=str, default='same', help='The suffix of output files. Default is the same as --exp' )
40
+ parser.add_argument('--signal_only' , action='store_true', help='A developer option: stop before group analysis. ' )
41
+ parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existed dataframes.' )
42
+ parser.add_argument('--clean' , action='store_true', help='Whether to remove temp files')
43
+
44
+ args = parser.parse_args()
45
+
46
+     print(f'Running offtracker version: {offtracker.__version__}')
47
+ # main parameters
48
+ folders = args.folder
49
+ sgRNA_name = args.name
50
+ pattern_exp = args.exp
51
+ pattern_ctr = args.control
52
+ binsize = args.binsize
53
+ flank_max = args.flank_max
54
+ flank_regions = args.flank_regions
55
+ smooth_times = args.smooth
56
+ window_size = args.window
57
+ seq_score_power = args.SeqScorePower
58
+ n_threads = args.thread
59
+
60
+ outdir = args.outdir
61
+ if outdir == 'first':
62
+ outdir = folders[0]
63
+ os.chdir(outdir)
64
+ # out temp folder
65
+ if not os.path.exists( os.path.join(outdir,'temp') ):
66
+ os.makedirs(os.path.join(outdir,'temp'))
67
+ # data temp folder
68
+ for a_folder in folders:
69
+ temp_dir = os.path.join(a_folder, 'temp')
70
+ if not os.path.exists( temp_dir ):
71
+ os.makedirs(temp_dir)
72
+
73
+ # load df_candidate
74
+ try:
75
+ df_candidate = pd.read_csv(os.path.join(args.seqfolder,f'df_candidate_{sgRNA_name}.csv'), index_col=0)
76
+ df_candidate.index = df_candidate['target_location']
77
+ df_candidate_brief = df_candidate[['chr','st','ed','best_strand','best_target','best_seq_score',
78
+ 'deletion', 'insertion','mismatch', 'GG',
79
+ 'target_location', 'cleavage_site', 'ID_1','ID_2']]
80
+ df_candidate_sub = df_candidate[['chr','cleavage_site']]
81
+ except FileNotFoundError:
82
+ return 'Please run offtracker_candidates.py first and provide the correct directory with --seqfolder'
83
+
84
+ ##################
85
+ ## glob samples ##
86
+ ##################
87
+ all_sample_names = []
88
+ all_sample_files = []
89
+ for a_folder in folders:
90
+ bdg_files = pd.Series(glob.glob(os.path.join( a_folder, '*.add.bdg' ))).sort_values().reset_index(drop=True)
91
+ sample_names = bdg_files.apply(os.path.basename).str.extract('(.*)\.\d+\.add\.bdg',expand=False)
92
+ all_sample_names.extend( sample_names )
93
+ all_sample_files.extend( bdg_files )
94
+ all_sample_files = pd.Series(all_sample_files)
95
+ all_sample_names = pd.Series(all_sample_names)
96
+
97
+ ctr_samples = []
98
+ if pattern_ctr == 'none':
99
+ if pattern_exp == 'all':
100
+ exp_samples = list( all_sample_names )
101
+ else:
102
+ exp_samples = []
103
+ for a_mark in pattern_exp:
104
+ exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
105
+ elif pattern_ctr == 'others':
106
+ if pattern_exp == 'all':
107
+ exp_samples = list( all_sample_names )
108
+ else:
109
+ exp_samples = []
110
+ for a_mark in pattern_exp:
111
+ exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
112
+ ctr_samples = list( all_sample_names[~all_sample_names.isin(exp_samples)] )
113
+ else:
114
+ for a_mark in pattern_ctr:
115
+ ctr_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
116
+ if pattern_exp == 'all':
117
+ exp_samples = list( all_sample_names[~all_sample_names.isin(ctr_samples)] )
118
+ else:
119
+ exp_samples = []
120
+ for a_mark in pattern_exp:
121
+ exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
122
+ n_exp = len(exp_samples)
123
+ n_ctr = len(ctr_samples)
124
+ print(f'Experimental group has {n_exp} samples:\n{exp_samples}')
125
+ print(f'Control group has {n_ctr} samples:\n{ctr_samples}')
126
+
127
+ # mark 错误时
128
+ assert n_exp > 0, 'No experimental sample is found. Please check the name pattern.'
129
+ if (n_ctr==0)&(pattern_ctr != 'none'):
130
+ print('Name pattern for control sample(s) was given, but no file meet the pattern.')
131
+ return 'Program terminated'
132
+
133
+ # summarize samples
134
+ bool_exp = all_sample_names.isin(exp_samples)
135
+ bool_ctr = all_sample_names.isin(ctr_samples)
136
+ exp_sample_files = all_sample_files[bool_exp]
137
+ ctr_sample_files = all_sample_files[bool_ctr]
138
+ exp_sample_names = all_sample_names[bool_exp]
139
+ ctr_sample_names = all_sample_names[bool_ctr]
140
+ selected_sample_files = pd.concat([exp_sample_files,ctr_sample_files])
141
+ selected_sample_names = pd.concat([exp_sample_names,ctr_sample_names])
142
+
143
+
144
+ ##########################
145
+ ## calculate the signal ##
146
+ ##########################
147
+
148
+ for a_file, a_name in zip(selected_sample_files, selected_sample_names):
149
+ st = time.time()
150
+ output = os.path.join(outdir, 'temp', a_name + f'.{sgRNA_name}.signal.csv')
151
+ if (os.path.isfile(output))&(not args.overwrite):
152
+ print(output, 'exists, skipped')
153
+ continue
154
+ df_bdg = xseq.read_bed(a_file)
155
+ df_bdg.columns = ['chr','start','end','residual']
156
+ # 将 df_bdg 按照染色体分组
157
+ sample_groups = df_bdg.groupby('chr')
158
+ # 将 df_candidate_sub 按照染色体分组
159
+ candidate_groups = df_candidate_sub.groupby('chr')
160
+
161
+ # 定义一个空的列表,用于存储每个染色体的数据
162
+ chrom_list = []
163
+ # 遍历分组后的数据
164
+ list_index = []
165
+ for chr_name, chr_candidate in candidate_groups:
166
+ # 获取当前染色体对应的 df_sample 数据
167
+ chr_sample = sample_groups.get_group(chr_name)
168
+ # 保留 index
169
+ list_index.extend(list(chr_candidate.index))
170
+ # 将参数数据存储到列表中
171
+ chrom_list.append([chr_sample, chr_candidate, flank_max, smooth_times, window_size, binsize, flank_regions])
172
+
173
+ # 多线程运行
174
+ with mp.Pool(n_threads) as p:
175
+ signal_all = p.starmap(offtracker.target_signal_chunk, chrom_list)
176
+ ed = time.time()
177
+ print(f'{ed-st}s for {a_name} with {n_threads} threads')
178
+ df_signal = pd.concat(signal_all)
179
+ df_signal.index = list_index
180
+ df_signal.to_csv(output)
181
+
182
+ if args.signal_only:
183
+ return 'signal_only is on, stop here.'
184
+
185
+ ####################
186
+ ## group analysis ##
187
+ ####################
188
+ if args.outname == 'same':
189
+ if isinstance(pattern_exp, list):
190
+ outname = '_'.join(pattern_exp)
191
+ else:
192
+ outname = pattern_exp
193
+ else:
194
+ outname = args.outname
195
+
196
+ output = f'./temp/df_score_{outname}.csv'
197
+ if (os.path.isfile(output))&(not args.overwrite):
198
+ print(f'skip {output}')
199
+ df_score = pd.read_csv(output, index_col=0)
200
+ else:
201
+ signal_files = pd.Series(glob.glob( os.path.join(outdir, 'temp', f'*{sgRNA_name}.signal.csv') ))
202
+ signal_names = signal_files.apply(os.path.basename).str.extract(f'(.*)\.{sgRNA_name}\.signal\.csv',expand=False)
203
+
204
+ # 读取并合并 samples
205
+ list_df_exp_samples = []
206
+ list_df_ctr_samples = []
207
+ for a_file, a_name in zip(signal_files, signal_names):
208
+ if a_name in exp_samples:
209
+ df_temp = pd.read_csv(a_file, index_col=0)
210
+ df_temp = df_temp.drop(['chr_cleavage'], axis=1)
211
+ list_df_exp_samples.append(df_temp)
212
+ elif a_name in ctr_samples:
213
+ df_temp = pd.read_csv(a_file, index_col=0)
214
+ df_temp = df_temp.drop(['chr_cleavage'], axis=1)
215
+ list_df_ctr_samples.append(df_temp)
216
+ else:
217
+ pass
218
+
219
+ # 计算每个组内的平均信号
220
+ # 2023.12.07. exp 和 ctr 的信号分开展示
221
+ df_score = df_candidate_brief.copy()
222
+ df_exp = xseq.combine_df(list_df_exp_samples)
223
+ if pattern_ctr != 'none':
224
+ df_ctr = xseq.combine_df(list_df_ctr_samples)
225
+ # 2023.12.10. 给 control 除了 'neg' 特征外的负数范围 clip,防止 exp-ctr 因此出现假阳性
226
+ # 2023.12.31. 将 clip 范围由 -5 改为 -1
227
+ # 2024.01.02. clip 模块移动到 filter and normalize
228
+ # cols_clip = df_ctr.columns[~df_ctr.columns.str.contains('neg_')]
229
+ # df_ctr[cols_clip] = df_ctr[cols_clip].clip(lower=-1)
230
+ # df_exp[cols_clip] = df_exp[cols_clip].clip(lower=-1)
231
+ # df_group_signal = df_exp - df_ctr
232
+ df_exp.columns = 'exp_' + df_exp.columns
233
+ df_ctr.columns = 'ctr_' + df_ctr.columns
234
+ df_score = pd.concat([df_score, df_exp, df_ctr], axis=1)
235
+ else:
236
+ df_score = pd.concat([df_score, df_exp], axis=1)
237
+ df_score = df_score.copy()
238
+ df_score.to_csv(output)
239
+
240
+ ##########################
241
+ ## filter and normalize ##
242
+ ##########################
243
+ output = f'./temp/df_result_{outname}.csv'
244
+ if (os.path.isfile(output))&(not args.overwrite):
245
+ print(f'skip {outname} as the result exists')
246
+ df_result = pd.read_csv(output, index_col=0)
247
+ else:
248
+ if pattern_ctr != 'none':
249
+ # 重算 proximal_signal 和 pct_score,因为 clip 了
250
+ cols_exp_L = list('exp_L_' + pd.Series(flank_regions).astype(str))
251
+ cols_exp_R = list('exp_R_' + pd.Series(flank_regions).astype(str))
252
+ cols_ctr_L = list('ctr_L_' + pd.Series(flank_regions).astype(str))
253
+ cols_ctr_R = list('ctr_R_' + pd.Series(flank_regions).astype(str))
254
+ cols_exp_L_pct_score = list('exp_L_pct_score_' + pd.Series(flank_regions).astype(str))
255
+ cols_exp_R_pct_score = list('exp_R_pct_score_' + pd.Series(flank_regions).astype(str))
256
+ cols_ctr_L_pct_score = list('ctr_L_pct_score_' + pd.Series(flank_regions).astype(str))
257
+ cols_ctr_R_pct_score = list('ctr_R_pct_score_' + pd.Series(flank_regions).astype(str))
258
+ df_score['exp_L_mean'] = df_score[cols_exp_L].mean(axis=1)
259
+ df_score['exp_R_mean'] = df_score[cols_exp_R].mean(axis=1)
260
+ df_score['ctr_L_mean'] = df_score[cols_ctr_L].clip(lower=args.CtrClip).mean(axis=1)
261
+ df_score['ctr_R_mean'] = df_score[cols_ctr_R].clip(lower=args.CtrClip).mean(axis=1)
262
+ df_score['exp_L_mean_pct_score'] = df_score[cols_exp_L_pct_score].mean(axis=1)
263
+ df_score['exp_R_mean_pct_score'] = df_score[cols_exp_R_pct_score].mean(axis=1)
264
+ df_score['ctr_L_mean_pct_score'] = df_score[cols_ctr_L_pct_score].clip(lower=args.CtrClip).mean(axis=1)
265
+ df_score['ctr_R_mean_pct_score'] = df_score[cols_ctr_R_pct_score].clip(lower=args.CtrClip).mean(axis=1)
266
+ df_score['L_mean'] = df_score['exp_L_mean'] - df_score['ctr_L_mean']
267
+ df_score['R_mean'] = df_score['exp_R_mean'] - df_score['ctr_R_mean']
268
+ df_score['L_mean_pct_score'] = df_score['exp_L_mean_pct_score'] - df_score['ctr_L_mean_pct_score']
269
+ df_score['R_mean_pct_score'] = df_score['exp_R_mean_pct_score'] - df_score['ctr_R_mean_pct_score']
270
+ df_score['L_length'] = df_score['exp_L_length'] - df_score['ctr_L_length']
271
+ df_score['R_length'] = df_score['exp_R_length'] - df_score['ctr_R_length']
272
+ df_score['signal_length'] = df_score['L_length'] + df_score['R_length']
273
+ df_score['proximal_signal'] = df_score['L_mean'] + df_score['R_mean']
274
+ df_score['pct_score'] = df_score['L_mean_pct_score'] + df_score['R_mean_pct_score']
275
+
276
+ # 整理表格
277
+ mean_seq_score = round(df_score['best_seq_score'].mean(),3)
278
+ df_score['norm_best_seq_score'] = np.power(df_score['best_seq_score']/mean_seq_score, seq_score_power)
279
+ df_score['final_score_1'] = df_score[f'proximal_signal']*df_score['norm_best_seq_score']
280
+ df_score['final_score_2'] = df_score['pct_score']*df_score['norm_best_seq_score']
281
+ #df_score['final_score_2'] = df_score[f'overall_signal']*df_score['norm_best_seq_score']
282
+ df_score['raw_score'] = df_score['final_score_1'] + df_score['final_score_2']
283
+ df_score = df_score.sort_values('raw_score', ascending=False)
284
+
285
+ # local dedup
286
+ list_nondup = offtracker.dedup_two(df_score,'ID_1','ID_2')
287
+ df_result = df_score[list_nondup].copy()
288
+
289
+ # 标准化分布
290
+ target_std=0.15
291
+ n_outliers = int(np.ceil(len(df_result)*0.01))
292
+ score_bkg = df_result['raw_score'][n_outliers:-n_outliers]
293
+ mean_score_bkg = score_bkg.mean()
294
+ std_score_bkg = score_bkg.std()
295
+ df_result['track_score'] = (df_result[f'raw_score'] - mean_score_bkg) / std_score_bkg
296
+ df_result['track_score'] = df_result[f'track_score']*target_std + 1
297
+ df_result = df_result.sort_values(by='track_score', ascending=False)
298
+ df_result['log2_track_score'] = np.log2(df_result[f'track_score'].clip(lower=0.5))
299
+
300
+ # 单边信号周围有更高分的,去掉
301
+ # v2.1 后 cols_L, cols_R 要手动
302
+ if pattern_ctr != 'none':
303
+ cols_L = ['exp_L_1000', 'exp_L_2000']
304
+ cols_R = ['exp_R_1000', 'exp_R_2000']
305
+ else:
306
+ cols_L = ['L_1000', 'L_2000'] # df_score.columns[df_score.columns.str.contains('^L_\d+')]
307
+ cols_R = ['R_1000', 'R_2000'] # df_score.columns[df_score.columns.str.contains('^R_\d+')]
308
+ seq_score_thresh = np.power(1.25, seq_score_power)
309
+ search_distance = 100000
310
+ candidate_dup = list(df_result[((df_result[cols_R].max(axis=1)<=0)|(df_result[cols_L].max(axis=1)<=0))&(df_result['log2_track_score']>0.8)].index)
311
+ list_dedup = []
312
+ for a_loc in candidate_dup:
313
+ temp_chr = df_result.loc[a_loc,'chr']
314
+ # 如果序列特别像就不过滤
315
+ temp_seq_score = df_result.loc[a_loc,'norm_best_seq_score']
316
+ if temp_seq_score > seq_score_thresh:
317
+ continue
318
+ # 取排其前且同chr者
319
+ temp_df_result = df_result.loc[:a_loc]
320
+ temp_df_result = temp_df_result[temp_df_result['chr'] == temp_chr].iloc[:-1]
321
+ if len(temp_df_result)==0:
322
+ continue
323
+ else:
324
+ # 距离小于 search_distance 者记为信号溢出假阳性
325
+ if (temp_df_result['cleavage_site']-df_result.loc[a_loc,'cleavage_site']).abs().min()<search_distance :
326
+ list_dedup.append(a_loc)
327
+ # 去除重复
328
+ df_result = df_result[~df_result.index.isin(list_dedup)].copy()
329
+ # print(f'filter {len(list_dedup)} sites')
330
+
331
+ # fitting normal distribution
332
+ score_for_fitting = df_result['log2_track_score'][n_outliers:-n_outliers]
333
+ mu, std = norm.fit(score_for_fitting)
334
+ print('mean_score:{:.3f};std:{:.3f}'.format(mu,std))
335
+ # pv and fdr
336
+ df_result['pv'] = df_result[f'log2_track_score'].apply( lambda x: norm.sf(x,loc=mu,scale=std) )
337
+ df_result['pv'].clip(lower=1e-320,inplace=True)
338
+ df_result['fdr'] = offtracker.fdr(df_result['pv'])
339
+ df_result['rank'] = range(1,len(df_result)+1)
340
+ df_result.to_csv(output)
341
+
342
+ df_output = df_result[df_result['fdr']<=0.05].copy()
343
+ if pattern_ctr != 'none':
344
+ df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
345
+ 'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
346
+ 'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
347
+ df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
348
+ 'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
349
+ 'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
350
+ else:
351
+ df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
352
+ 'L_length', 'R_length','signal_length',
353
+ 'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
354
+ df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
355
+ 'L_length', 'R_length','signal_length',
356
+ 'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
357
+ df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
358
+
359
+ if args.clean:
360
+ shutil.rmtree('./temp')
361
+
362
+ return 'Done!'
363
+
364
+
365
+ if __name__ == '__main__' :
366
+ result = main()
367
+ print(result)
368
+
369
+