offtracker 1.0.2__zip → 2.7.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {offtracker-1.0.2/offtracker.egg-info → offtracker-2.7.7}/PKG-INFO +1 -1
  2. offtracker-2.7.7/offtracker/X_offplot.py +123 -0
  3. offtracker-2.7.7/offtracker/X_offtracker.py +338 -0
  4. offtracker-1.0.2/offtracker/X_general.py → offtracker-2.7.7/offtracker/X_sequence.py +18 -5
  5. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/__init__.py +1 -1
  6. offtracker-2.7.7/offtracker/_version.py +27 -0
  7. offtracker-2.7.7/offtracker/mapping/Snakefile_offtracker +245 -0
  8. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +3846 -0
  9. offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +5827 -0
  10. {offtracker-1.0.2 → offtracker-2.7.7/offtracker.egg-info}/PKG-INFO +1 -1
  11. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/SOURCES.txt +4 -3
  12. offtracker-2.7.7/scripts/offtracker_analysis.py +369 -0
  13. {offtracker-1.0.2 → offtracker-2.7.7}/scripts/offtracker_candidates.py +59 -101
  14. {offtracker-1.0.2 → offtracker-2.7.7}/scripts/offtracker_config.py +15 -10
  15. offtracker-1.0.2/offtracker/X_analysis.py +0 -332
  16. offtracker-1.0.2/offtracker/_version.py +0 -1
  17. offtracker-1.0.2/offtracker/mapping/Snakefile_Trackseq +0 -175
  18. offtracker-1.0.2/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +0 -22228
  19. offtracker-1.0.2/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +0 -9347
  20. offtracker-1.0.2/scripts/offtracker_analysis.py +0 -407
  21. {offtracker-1.0.2 → offtracker-2.7.7}/LICENSE.txt +0 -0
  22. {offtracker-1.0.2 → offtracker-2.7.7}/MANIFEST.in +0 -0
  23. {offtracker-1.0.2 → offtracker-2.7.7}/README.md +0 -0
  24. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/1.1_bed2fr_v4.5.py +0 -0
  25. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/1.3_bdg_normalize_v4.0.py +0 -0
  26. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/bedGraphToBigWig +0 -0
  27. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/hg38.chrom.sizes +0 -0
  28. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/mm10.chrom.sizes +0 -0
  29. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-1.0.2 → offtracker-2.7.7}/setup.cfg +0 -0
  33. {offtracker-1.0.2 → offtracker-2.7.7}/setup.py +0 -0
@@ -1,332 +0,0 @@
1
-
2
- import pandas as pd
3
- import numpy as np
4
- import os, sys, pybedtools
5
- sys.path.append( os.path.abspath(os.path.dirname(__file__)) )
6
- from X_general import *
7
-
8
- def signal_formula1(signal, nonsignal, pseudocount):
9
- # 背景均值略小于 1
10
- assert pseudocount>0
11
- return signal/(nonsignal+pseudocount)
12
-
13
- def signal_formula2(signal, nonsignal, pseudocount):
14
- # 背景均值为 1
15
- assert pseudocount>0
16
- return (signal+pseudocount)/(nonsignal+pseudocount)
17
-
18
- def signal_formula3(signal, nonsignal, pseudocount):
19
- # 调整背景均值为 1
20
- assert pseudocount>0
21
- out = (signal - nonsignal + 1)
22
- out.loc[out<0] = 0
23
- return out
24
-
25
- def signal_formula4(signal, nonsignal, pseudocount):
26
- # 背景均值为 1
27
- assert pseudocount>0
28
- return signal
29
-
30
- def shape_formula1(normed_left_signal, normed_left_nonsignal, normed_right_signal, normed_right_nonsignal, bkg_noise, ratio_thresh=1.5, exp_weight = 0.3):
31
- # 防止破坏性的极小值出现
32
- normed_left_signal = normed_left_signal.clip(lower=1)
33
- normed_left_nonsignal = normed_left_nonsignal.clip(lower=1)
34
- normed_right_signal = normed_right_signal.clip(lower=1)
35
- normed_right_nonsignal = normed_right_nonsignal.clip(lower=1)
36
- left_ratio = normed_left_signal/normed_left_nonsignal
37
- right_ratio = normed_right_signal/normed_right_nonsignal
38
- #
39
- good_shape = pd.concat([left_ratio,right_ratio],axis=1).min(axis=1)>ratio_thresh
40
- left_ratio.loc[~good_shape]=1
41
- right_ratio.loc[~good_shape]=1
42
- bonus_coef = left_ratio*right_ratio
43
- bonus_coef = np.power(bonus_coef,exp_weight)
44
- return bonus_coef
45
-
46
- def shape_formula2(normed_left_signal, normed_left_nonsignal, normed_right_signal, normed_right_nonsignal, bkg_noise, ratio_thresh=1, exp_weight = 0.3):
47
- left_residual = normed_left_signal - normed_left_nonsignal
48
- right_residual = normed_right_signal - normed_right_nonsignal
49
- # 这里先取 max,也可以取 min 或者 mean
50
- good_left = left_residual>ratio_thresh
51
- good_right = right_residual>ratio_thresh
52
- good_shape = good_left&good_right
53
- left_residual.loc[~good_shape]=0
54
- right_residual.loc[~good_shape]=0
55
- bonus_coef = (left_residual+right_residual)/(2*ratio_thresh)
56
- bonus_coef = np.power(bonus_coef,exp_weight)
57
- bonus_coef = bonus_coef.clip(lower=1)
58
- return bonus_coef
59
-
60
- def score_formula1(exp, ctrl, pseudocount):
61
- assert pseudocount>0
62
- return exp/(max(ctrl,0)+pseudocount)
63
-
64
- def score_formula2(exp, ctrl, pseudocount):
65
- assert pseudocount>0
66
- return (exp+pseudocount)/(max(ctrl,0)+pseudocount)
67
-
68
- def score_formula3(exp, ctrl, pseudocount):
69
- assert pseudocount>0
70
- return exp/max(ctrl, pseudocount)
71
-
72
- def score_formula4(exp, ctrl, pseudocount):
73
- assert pseudocount>0
74
- return max((exp - ctrl), 0)
75
-
76
- def score_formula5(exp, ctrl, pseudocount):
77
- assert pseudocount>0
78
- return exp
79
-
80
- def fdr(p_vals):
81
- # Benjamini-Hochberg
82
- from scipy.stats import rankdata
83
- ranked_p_values = rankdata(p_vals)
84
- fdr = p_vals * len(p_vals) / ranked_p_values
85
- fdr[fdr > 1] = 1
86
- return fdr
87
-
88
- def dedup_two( df_loc, col_ID_1='ID_1', col_ID_2='ID_2'):
89
- # 会根据 df_loc 的排序保留第一个 location
90
- # dedup 结束后,剩下的 ID_1 + ID_2 并集可能会小于 dedup 前的并集
91
- list_nondup = []
92
- set_IDs = set()
93
- df_IDs = df_loc[[col_ID_1,col_ID_2]]
94
- for a_row in df_IDs.iterrows():
95
- temp = a_row[1]
96
- if (temp[col_ID_1] in set_IDs) or (temp[col_ID_2] in set_IDs):
97
- # 只要有一ID出现过,即便另一ID没出现过,也不更新 set_IDs
98
- list_nondup.append(False)
99
- else:
100
- set_IDs.add(temp[col_ID_1])
101
- set_IDs.add(temp[col_ID_2])
102
- list_nondup.append(True)
103
- return list_nondup
104
-
105
- def cand_count(a_pref, the_sgRNA, regions, seq_folder, dir_chrom_sizes, overwrite=False):
106
- #
107
- forward_bed = f'{a_pref}.fw.bed'
108
- reverse_bed = f'{a_pref}.rv.bed'
109
- # put into temp_dir
110
- dirname = os.path.dirname(a_pref)
111
- basename = os.path.basename(a_pref)
112
- temp_dir = os.path.join(dirname, 'temp')
113
- if not os.path.exists( temp_dir ):
114
- os.makedirs(temp_dir)
115
- a_pref = os.path.join(temp_dir, basename)
116
- #
117
- for a_region in regions:
118
- work = 1
119
- if os.path.isfile(f'{a_pref}_candidate_{the_sgRNA}_right_rstrand_{a_region}.count'):
120
- #print(f'Candidates for {the_sgRNA} within {a_region} of {basename} exists.\n')
121
- work = 0
122
- if overwrite:
123
- print('overwrite mode')
124
- work = 1
125
- if work == 1:
126
- print(f'Working for {basename} within {a_region} bp.\n')
127
- # 左侧区域
128
- left_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_left_{a_region}.bed')
129
- a = pybedtools.BedTool(left_region)
130
- b = pybedtools.BedTool(forward_bed)
131
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
132
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_left_fstrand_{a_region}.count')
133
- b = pybedtools.BedTool(reverse_bed)
134
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
135
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_left_rstrand_{a_region}.count')
136
- # 右侧区域
137
- right_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_right_{a_region}.bed')
138
- a = pybedtools.BedTool(right_region)
139
- b = pybedtools.BedTool(forward_bed)
140
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
141
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_right_fstrand_{a_region}.count')
142
- b = pybedtools.BedTool(reverse_bed)
143
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
144
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_right_rstrand_{a_region}.count')
145
-
146
- def bkg_count(a_pref, the_sgRNA, bkgs, seq_folder, dir_chrom_sizes, overwrite=False):
147
- # no need for LF and RR
148
- forward_bed = f'{a_pref}.fw.bed'
149
- reverse_bed = f'{a_pref}.rv.bed'
150
- # put into temp_dir
151
- dirname = os.path.dirname(a_pref)
152
- basename = os.path.basename(a_pref)
153
- temp_dir = os.path.join(dirname, 'temp')
154
- if not os.path.exists( temp_dir ):
155
- os.makedirs(temp_dir)
156
- a_pref = os.path.join(temp_dir, basename)
157
- #
158
- work = 1
159
- for a_region in bkgs:
160
- if os.path.isfile(f'{a_pref}_candidate_{the_sgRNA}_right_fstrand_{a_region}.count'):
161
- #print(f'Candidates for {the_sgRNA} within {a_region} of {basename} exists.\n')
162
- work = 0
163
- if overwrite:
164
- print('overwrite mode')
165
- work = 1
166
- if work == 1:
167
- print(f'Working for {basename} within {a_region} bp.\n')
168
- # 左侧区域
169
- left_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_left_{a_region}.bed')
170
- a = pybedtools.BedTool(left_region)
171
- b = pybedtools.BedTool(reverse_bed)
172
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
173
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_left_rstrand_{a_region}.count')
174
- # 右侧区域
175
- right_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_right_{a_region}.bed')
176
- a = pybedtools.BedTool(right_region)
177
- b = pybedtools.BedTool(forward_bed)
178
- c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
179
- c.saveas(f'{a_pref}_candidate_{the_sgRNA}_right_fstrand_{a_region}.count')
180
-
181
-
182
- def load_count( a_sample, regions, bkgs, sgRNA_name, signal_formula, noise_length,
183
- ratio_thresh, exp_weight, shape_formula=None,
184
- operator='p', region_op='mean', dirname='./', pseudo_count=1
185
- ):
186
- list_noise = []
187
- for a_bkg in bkgs:
188
- left_count_rstrand = os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_rstrand_{a_bkg}.count')
189
- left_count_rstrand = readbed(left_count_rstrand)
190
- right_count_fstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_fstrand_{a_bkg}.count')
191
- right_count_fstrand = readbed(right_count_fstrand)
192
- list_noise.append(right_count_fstrand[3])
193
- list_noise.append(left_count_rstrand[3])
194
- df_noise = pd.concat(list_noise,axis=1, ignore_index=True)
195
- df_noise['mean'] = df_noise.mean(axis=1)
196
- mean_all = df_noise['mean'].mean()
197
- df_noise['std'] = df_noise.std(axis=1)
198
- df_noise['outlier'] = df_noise['mean']+2*df_noise['std']
199
- for i in range(len(bkgs)*2):
200
- df_noise.loc[df_noise[i]>df_noise['outlier'],i] = np.nan
201
- df_noise['mean2'] = df_noise.iloc[:,:6].mean(axis=1, skipna = True)
202
- n_0bkg = sum(df_noise.mean2==0)
203
- if n_0bkg > 0:
204
- print(f'{n_0bkg} region(s) with 0 count in background.')
205
- # 由于有些位置可能出现无法 mapping 而产生大量空白,导致局部噪音过低,因此这里主要是防局部高噪音造成假阳性
206
- df_noise.loc[df_noise['mean2']<mean_all, 'mean2'] = mean_all
207
- df_noise['noise_bp'] = df_noise['mean2']/noise_length
208
- noise_5kb = df_noise['noise_bp'].mean()*5000
209
- print('Average noise within 5kb on a single strand: {:.2f}'.format(noise_5kb))
210
- if noise_5kb < 10:
211
- print('The sequencing depth might be too shallow')
212
- list_df_counts = []
213
- for a_region in regions:
214
- left_count_fstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_fstrand_{a_region}.count')
215
- left_count_fstrand = readbed(left_count_fstrand)
216
- left_count_fstrand.columns=[f'chr_{a_region}',f'st_left_{a_region}',f'ed_left_{a_region}',f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'length_{a_region}',f'cover_left_F_pct_{a_region}']
217
- right_count_fstrand=os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_fstrand_{a_region}.count')
218
- right_count_fstrand = readbed(right_count_fstrand)
219
- right_count_fstrand.columns=[f'chr_{a_region}',f'st_right_{a_region}',f'ed_right_{a_region}',f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'length_{a_region}',f'cover_right_F_pct_{a_region}']
220
- left_count_rstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_rstrand_{a_region}.count')
221
- left_count_rstrand = readbed(left_count_rstrand)
222
- left_count_rstrand.columns=[f'chr_{a_region}',f'st_left_{a_region}',f'ed_left_{a_region}',f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'length_{a_region}',f'cover_left_R_pct_{a_region}']
223
- right_count_rstrand=os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_rstrand_{a_region}.count')
224
- right_count_rstrand = readbed(right_count_rstrand)
225
- right_count_rstrand.columns=[f'chr_{a_region}',f'st_right_{a_region}',f'ed_right_{a_region}',f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'length_{a_region}',f'cover_right_R_pct_{a_region}']
226
-
227
- df_counts = pd.concat([left_count_fstrand[[f'chr_{a_region}',f'st_left_{a_region}', f'ed_left_{a_region}',
228
- f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}']],
229
- left_count_rstrand[[f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}']],
230
- right_count_fstrand[[f'ed_right_{a_region}',f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}']],
231
- right_count_rstrand[[f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}']]],axis=1)
232
-
233
- df_counts = df_counts.reindex(columns=[f'chr_{a_region}', f'st_left_{a_region}', f'ed_right_{a_region}', f'ed_left_{a_region}',
234
- f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}',
235
- f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}',
236
- f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}',
237
- f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}'])
238
- df_counts.columns = [f'chr_{a_region}', f'st_{a_region}', f'ed_{a_region}', f'midpoint_{a_region}',
239
- f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}',
240
- f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}',
241
- f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}',
242
- f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}']
243
-
244
- # signal enrichment = Cs/(Cn+B)
245
- bkg_noise = df_noise['noise_bp']*a_region
246
- normed_left_signal = df_counts[f'counts_left_F_{a_region}']/bkg_noise
247
- normed_left_nonsignal = df_counts[f'counts_left_R_{a_region}']/bkg_noise
248
- normed_right_signal = df_counts[f'counts_right_R_{a_region}']/bkg_noise
249
- normed_right_nonsignal = df_counts[f'counts_right_F_{a_region}']/bkg_noise
250
- df_counts[f'N_LF_{a_region}'] = normed_left_signal
251
- df_counts[f'N_LR_{a_region}'] = normed_left_nonsignal
252
- df_counts[f'N_RR_{a_region}'] = normed_right_signal
253
- df_counts[f'N_RF_{a_region}'] = normed_right_nonsignal
254
- # 可变公式区
255
- df_counts[f'left_signal_{a_region}'] = signal_formula(normed_left_signal, normed_left_nonsignal, pseudo_count)
256
- df_counts[f'right_signal_{a_region}'] = signal_formula(normed_right_signal, normed_right_nonsignal, pseudo_count)
257
- if shape_formula:
258
- df_counts[f'shape_bonus_{a_region}'] = shape_formula(normed_left_signal, normed_left_nonsignal,
259
- normed_right_signal, normed_right_nonsignal,
260
- bkg_noise, ratio_thresh, exp_weight)
261
- else:
262
- df_counts[f'shape_bonus_{a_region}'] = 1
263
- list_df_counts.append(df_counts)
264
- df_counts = pd.concat(list_df_counts,axis=1)
265
- left_signal_cols = df_counts.columns[ df_counts.columns.str.contains('left_signal_') ]
266
- right_signal_cols = df_counts.columns[ df_counts.columns.str.contains('right_signal_') ]
267
-
268
- print('region_op',region_op)
269
- if region_op == 'mean':
270
- df_counts['left_signal'] = df_counts[left_signal_cols].mean(axis=1)
271
- df_counts['right_signal'] = df_counts[right_signal_cols].mean(axis=1)
272
- elif region_op == 'max':
273
- df_counts['left_signal'] = df_counts[left_signal_cols].max(axis=1)
274
- df_counts['right_signal'] = df_counts[right_signal_cols].max(axis=1)
275
- elif region_op == 'min':
276
- df_counts['left_signal'] = df_counts[left_signal_cols].min(axis=1)
277
- df_counts['right_signal'] = df_counts[right_signal_cols].min(axis=1)
278
- else:
279
- raise Exception('region_op should be "mean", "max", or "min" ')
280
-
281
- max_region = max(regions)
282
- df_counts['chr'] = df_counts[f'chr_{max_region}']
283
- df_counts['st'] = df_counts[f'st_{max_region}']
284
- df_counts['ed'] = df_counts[f'ed_{max_region}']
285
- df_counts['midpoint'] = df_counts[f'midpoint_{max_region}']
286
- df_counts = df_counts.reindex( columns= ['chr','st','ed','midpoint'] + list(df_counts.columns) )
287
- df_counts = df_counts.loc[:,~df_counts.columns.duplicated()].copy()
288
- print('Raw regions:', len(df_counts))
289
-
290
- # operator 选择
291
- if operator == 'p':
292
- df_counts['score'] = (df_counts['left_signal']+df_counts['right_signal'])/2
293
- elif operator == 'm':
294
- df_counts['score'] = np.power(df_counts['left_signal']*df_counts['right_signal'],0.5)
295
-
296
- #
297
- min_region = min(regions)
298
-
299
- if shape_formula:
300
- #
301
- shape_bonus_cols = df_counts.columns[ df_counts.columns.str.contains('shape_bonus_') ]
302
- df_counts['shape_bonus'] = df_counts[shape_bonus_cols].mean(axis=1)
303
-
304
-
305
- df_counts = df_counts.sort_values(by='score',ascending=False).reset_index(drop=True)
306
-
307
- ### 一些其他特征
308
-
309
- # 左右最小范围信号绝对强度比值与差值
310
- max_adjacent_ratio = df_counts[[f'left_signal_{min_region}',f'right_signal_{min_region}']].max(axis=1)
311
- min_adjacent_ratio = df_counts[[f'left_signal_{min_region}',f'right_signal_{min_region}']].min(axis=1)
312
- df_counts['signal_FC'] = max_adjacent_ratio/(min_adjacent_ratio+0.001)
313
-
314
- # 左右500bp小信号边若为负数,可能是假的
315
- df_counts['left_signal_residual'] = df_counts[f'N_LF_{min_region}'] - df_counts[f'N_LR_{min_region}']
316
- df_counts['right_signal_residual'] = df_counts[f'N_RR_{min_region}'] - df_counts[f'N_RF_{min_region}']
317
- min_adjacent_signal = df_counts[['left_signal_residual','right_signal_residual']].min(axis=1)
318
- df_counts['signal_min'] = min_adjacent_signal
319
-
320
- # 具体位置去重
321
- df_counts['location'] = igvfmt(df_counts)
322
- df_counts = df_counts.drop_duplicates(subset='location').reset_index(drop=True).copy()
323
-
324
- # 第二版 unique_ID
325
- point_head = (df_counts['midpoint']/1000).astype(int)
326
- df_counts['ID_1'] = df_counts['chr'] + ':' + point_head.astype(str)
327
- point_tail = df_counts['midpoint'] % 1000
328
- df_counts.loc[point_tail<500,'ID_2'] = df_counts['chr'] + ':' + (point_head-1).astype(str)
329
- df_counts.loc[point_tail>=500,'ID_2'] = df_counts['chr'] + ':' + (point_head+1).astype(str)
330
-
331
- return df_counts, df_noise
332
-
@@ -1 +0,0 @@
1
- __version__ = "1.0.2"
@@ -1,175 +0,0 @@
1
-
2
- configfile: "config.yaml"
3
-
4
- _threads = config["thread"]
5
- BinSize = str(config["binsize"])
6
- output_dir = config["output_dir"]
7
- nametype = config["nametype"]
8
- suffix = config["suffix"]
9
- name1 = nametype.replace('2','1') + '.' + suffix
10
- name2 = nametype + '.' + suffix
11
-
12
- import os
13
-
14
-
15
- rule all:
16
- input:
17
- expand( os.path.join(output_dir,"{sample}.fw.bed"), sample=config["sample"] ),
18
- expand( os.path.join(output_dir,"{sample}.rv.bed"), sample=config["sample"] ),
19
- expand( os.path.join(output_dir,"{sample}.fw.scaled.bw"), sample=config["sample"] ),
20
- expand( os.path.join(output_dir,"{sample}.rv.scaled.bw"), sample=config["sample"] ),
21
-
22
-
23
- rule chromap:
24
- input:
25
- R1= lambda w: config["sample"][w.sample] + name1,
26
- R2= lambda w: config["sample"][w.sample] + name2
27
- threads:
28
- _threads
29
- params:
30
- index=config["index"],
31
- fasta=config["fasta"]
32
- output:
33
- temp(os.path.join(output_dir,"{sample}.chromapx.bed"))
34
- shell:
35
- """
36
- chromap -l 2000 --low-mem --BED --remove-pcr-duplicates \
37
- --min-read-length 10 --allocate-multi-mappings \
38
- -x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}
39
- """
40
-
41
- if config["blacklist"] != 'none':
42
- rule remove_blacklist:
43
- input:
44
- os.path.join(output_dir,"{sample}.chromapx.bed")
45
- threads:
46
- _threads
47
- params:
48
- blacklist=config["blacklist"]
49
- output:
50
- temp(os.path.join(output_dir,"{sample}.filtered.bed"))
51
- shell:
52
- "bedtools intersect -a {input} -b {params.blacklist} -v > {output}"
53
-
54
- rule bed2fr:
55
- input:
56
- os.path.join(output_dir,"{sample}.filtered.bed")
57
- threads:
58
- _threads
59
- params:
60
- dir_script=config["script_folder"]
61
- output:
62
- fw=os.path.join(output_dir,"{sample}.fw.bed"),
63
- rv=os.path.join(output_dir,"{sample}.rv.bed")
64
- shell:
65
- "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
66
- else:
67
- rule bed2fr:
68
- input:
69
- os.path.join(output_dir,"{sample}.chromapx.bed")
70
- threads:
71
- _threads
72
- params:
73
- dir_script=config["script_folder"]
74
- output:
75
- fw=os.path.join(output_dir,"{sample}.fw.bed"),
76
- rv=os.path.join(output_dir,"{sample}.rv.bed")
77
- shell:
78
- "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
79
-
80
- rule bed2bdg_fw:
81
- input:
82
- os.path.join(output_dir,"{sample}.fw.bed")
83
- threads:
84
- _threads
85
- params:
86
- gl=config["genomelen"]
87
- output:
88
- temp(os.path.join(output_dir,"{sample}.fw.bdg"))
89
- shell:
90
- "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
91
-
92
- rule bed2bdg_rv:
93
- input:
94
- os.path.join(output_dir,"{sample}.rv.bed")
95
- threads:
96
- _threads
97
- params:
98
- gl=config["genomelen"]
99
- output:
100
- temp(os.path.join(output_dir,"{sample}.rv.bdg"))
101
- shell:
102
- "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
103
-
104
- rule bdg_sort_fw:
105
- input:
106
- fw=os.path.join(output_dir,"{sample}.fw.bdg")
107
- threads:
108
- _threads
109
- output:
110
- temp(os.path.join(output_dir,"{sample}.fw.sorted.bdg"))
111
- shell:
112
- "bedtools sort -i {input.fw} > {output}"
113
-
114
- rule bdg_sort_rv:
115
- input:
116
- rv=os.path.join(output_dir,"{sample}.rv.bdg")
117
- threads:
118
- _threads
119
- output:
120
- temp(os.path.join(output_dir,"{sample}.rv.sorted.bdg"))
121
- shell:
122
- "bedtools sort -i {input.rv} > {output}"
123
-
124
- rule bdg_normalize_fw:
125
- input:
126
- bdg=os.path.join(output_dir,"{sample}.fw.sorted.bdg"),
127
- bed=os.path.join(output_dir,"{sample}.fw.bed")
128
- threads:
129
- _threads
130
- params:
131
- dir_script=config["script_folder"]
132
- output:
133
- temp(os.path.join(output_dir,"{sample}.fw.scaled.bdg"))
134
- shell:
135
- "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
136
-
137
- rule bdg_normalize_rv:
138
- input:
139
- bdg=os.path.join(output_dir,"{sample}.rv.sorted.bdg"),
140
- bed=os.path.join(output_dir,"{sample}.rv.bed")
141
- threads:
142
- _threads
143
- params:
144
- dir_script=config["script_folder"]
145
- output:
146
- temp(os.path.join(output_dir,"{sample}.rv.scaled.bdg"))
147
- shell:
148
- "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
149
-
150
- rule bdg2bw_fw:
151
- input:
152
- os.path.join(output_dir,"{sample}.fw.scaled.bdg")
153
- threads:
154
- _threads
155
- params:
156
- gl=config["genomelen"],
157
- dir_script=config["script_folder"]
158
- output:
159
- os.path.join(output_dir,"{sample}.fw.scaled.bw")
160
- shell:
161
- "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
162
-
163
- rule bdg2bw_rv:
164
- input:
165
- os.path.join(output_dir,"{sample}.rv.scaled.bdg")
166
- threads:
167
- _threads
168
- params:
169
- gl=config["genomelen"],
170
- dir_script=config["script_folder"]
171
- output:
172
- os.path.join(output_dir,"{sample}.rv.scaled.bw")
173
- shell:
174
- "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
175
-